summaryrefslogtreecommitdiff
path: root/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c
diff options
context:
space:
mode:
Diffstat (limited to 'XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c')
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c620
1 files changed, 620 insertions, 0 deletions
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c
new file mode 100644
index 00000000..eda4da18
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c
@@ -0,0 +1,620 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_s_d_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ double x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ double result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ float a_elem;
+ double x_elem;
+ double y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ {
+ double dt = (double) a_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ head_prod = x_elem * dt;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ {
+ double dt = (double) a_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ head_prod = x_elem * dt;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum1 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum2 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i * split;
+ a1 = con - beta_i;
+ a1 = con - a1;
+ a2 = beta_i - a1;
+ con = y_elem * split;
+ b1 = con - y_elem;
+ b1 = con - b1;
+ b2 = y_elem - b1;
+
+ head_tmp4 = beta_i * y_elem;
+ tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_dgbmv2_s_d_x */