diff options
63 files changed, 44789 insertions, 11 deletions
diff --git a/XBLAS/Makefile b/XBLAS/Makefile index c43bc1b3..5a006bfd 100644 --- a/XBLAS/Makefile +++ b/XBLAS/Makefile @@ -11,10 +11,10 @@ include make.conf include $(MAKEINC) -LIB = dot-lib sum-lib axpby-lib waxpby-lib gemv-lib ge_sum_mv-lib gbmv-lib symv-lib spmv-lib sbmv-lib hemv-lib hpmv-lib hbmv-lib trmv-lib tpmv-lib trsv-lib tbsv-lib gemm-lib symm-lib hemm-lib gemv2-lib symv2-lib hemv2-lib -TESTS = dot-test sum-test axpby-test waxpby-test gemv-test ge_sum_mv-test gbmv-test symv-test spmv-test sbmv-test hemv-test hpmv-test hbmv-test trmv-test tpmv-test trsv-test tbsv-test gemm-test symm-test hemm-test gemv2-test symv2-test hemv2-test -TEST_LIB = dot-test-lib sum-test-lib axpby-test-lib waxpby-test-lib gemv-test-lib ge_sum_mv-test-lib gbmv-test-lib symv-test-lib spmv-test-lib sbmv-test-lib hemv-test-lib hpmv-test-lib hbmv-test-lib trmv-test-lib tpmv-test-lib trsv-test-lib tbsv-test-lib gemm-test-lib symm-test-lib hemm-test-lib gemv2-test-lib symv2-test-lib hemv2-test-lib -CLEAN = dot-clean sum-clean axpby-clean waxpby-clean gemv-clean ge_sum_mv-clean gbmv-clean symv-clean spmv-clean sbmv-clean hemv-clean hpmv-clean hbmv-clean trmv-clean tpmv-clean trsv-clean tbsv-clean gemm-clean symm-clean hemm-clean gemv2-clean symv2-clean hemv2-clean dot2-clean +LIB = dot-lib sum-lib axpby-lib waxpby-lib gemv-lib ge_sum_mv-lib gbmv-lib symv-lib spmv-lib sbmv-lib hemv-lib hpmv-lib hbmv-lib trmv-lib tpmv-lib trsv-lib tbsv-lib gemm-lib symm-lib hemm-lib gemv2-lib symv2-lib hemv2-lib gbmv2-lib +TESTS = dot-test sum-test axpby-test waxpby-test gemv-test ge_sum_mv-test gbmv-test symv-test spmv-test sbmv-test hemv-test hpmv-test hbmv-test trmv-test tpmv-test trsv-test tbsv-test gemm-test symm-test hemm-test gemv2-test symv2-test hemv2-test gbmv2-test +TEST_LIB = dot-test-lib sum-test-lib axpby-test-lib waxpby-test-lib gemv-test-lib ge_sum_mv-test-lib gbmv-test-lib symv-test-lib spmv-test-lib sbmv-test-lib hemv-test-lib hpmv-test-lib hbmv-test-lib trmv-test-lib tpmv-test-lib trsv-test-lib tbsv-test-lib gemm-test-lib symm-test-lib hemm-test-lib gemv2-test-lib symv2-test-lib hemv2-test-lib gbmv2-test-lib +CLEAN = dot-clean sum-clean axpby-clean waxpby-clean gemv-clean ge_sum_mv-clean gbmv-clean symv-clean spmv-clean sbmv-clean hemv-clean hpmv-clean hbmv-clean trmv-clean tpmv-clean trsv-clean tbsv-clean gemm-clean symm-clean hemm-clean gemv2-clean symv2-clean hemv2-clean gbmv2-clean dot2-clean SRC_DIR = src TEST_DIR = testing @@ -52,6 +52,7 @@ tests: test-lib $(TESTS) cat $(TEST_DIR)/test-gemv2/gemv2.results >> testall.result cat $(TEST_DIR)/test-symv2/symv2.results >> testall.result cat $(TEST_DIR)/test-hemv2/hemv2.results >> testall.result + cat $(TEST_DIR)/test-gbmv2/gbmv2.results >> testall.result grep 'FAIL/TOTAL' testall.result >testall.summary cat testall.summary @@ -86,6 +87,7 @@ lib: $(LIB) cd $(SRC_DIR)/gemv2 && $(MAKE) lib cd $(SRC_DIR)/symv2 && $(MAKE) lib cd $(SRC_DIR)/hemv2 && $(MAKE) lib + cd $(SRC_DIR)/gbmv2 && $(MAKE) lib # custom test-dot2 stuff @@ -536,6 +538,25 @@ hemv2-clean: cd $(TEST_DIR)/test-hemv2 && $(MAKE) clean +# gbmv2 stuff + +gbmv2: gbmv2-test + + +gbmv2-lib: common-lib + cd $(SRC_DIR)/gbmv2 && $(MAKE) + +gbmv2-test-lib: lib common-test-lib + cd $(TEST_DIR)/test-gbmv2 && $(MAKE) do_test_gbmv2 + +gbmv2-test: gbmv2-test-lib + cd $(TEST_DIR)/test-gbmv2 && $(MAKE) + +gbmv2-clean: + cd $(SRC_DIR)/gbmv2 && $(MAKE) clean + cd $(TEST_DIR)/test-gbmv2 && $(MAKE) clean + + # Test library dependencies sum-test-lib: dot-test-lib diff --git a/XBLAS/src/blas_extended_proto.h b/XBLAS/src/blas_extended_proto.h index ba1497a0..3bdc3fde 100644 --- a/XBLAS/src/blas_extended_proto.h +++ b/XBLAS/src/blas_extended_proto.h @@ -1968,6 +1968,148 @@ void BLAS_zhemv2_z_d_x(enum blas_order_type order, enum blas_uplo_type uplo, enum blas_prec_type prec); +void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy); +void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy); +void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy); +void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy); +void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, float alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, float beta, + float *y, int incy, enum blas_prec_type prec); +void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); +void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); +void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); +void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); +void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + int BLAS_fpinfo_x(enum blas_cmach_type cmach, enum blas_prec_type prec); void BLAS_error(const char *rname, int iflag, int ival, char *form, ...); diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c new file mode 100644 index 00000000..5f4f728a --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_cgbmv2_c_s, BLAS_CGBMV2_C_S) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const float *head_x, const float *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_cgbmv2_c_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c new file mode 100644 index 00000000..3787bdba --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c @@ -0,0 +1,303 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_c_s"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1[2]; + float sum2[2]; + float prod[2]; + float a_elem[2]; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1]; + tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0]; + } + + { + tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1]; + tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0]; + } + + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_cgbmv2_c_s */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c new file mode 100644 index 00000000..9f95e1ff --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_cgbmv2_c_s_x, BLAS_CGBMV2_C_S_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const float *head_x, const float *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_cgbmv2_c_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c new file mode 100644 index 00000000..d4411041 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c @@ -0,0 +1,1516 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_c_s_x"; + + switch (prec) { + case blas_prec_single:{ + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1[2]; + float sum2[2]; + float prod[2]; + float a_elem[2]; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1]; + tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0]; + } + + { + tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1]; + tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0]; + } + + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + + break; + } + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + float result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = (double) a_elem[0] * x_elem; + prod[1] = (double) a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = (double) a_elem[0] * x_elem; + prod[1] = (double) a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = (double) a_elem[0] * x_elem; + prod[1] = (double) a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = (double) a_elem[0] * x_elem; + prod[1] = (double) a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + float result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + float a_elem[2]; + float x_elem; + float y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + head_prod[0] = (double) a_elem[0] * x_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) a_elem[1] * x_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem = tail_x_i[jx]; + { + head_prod[0] = (double) a_elem[0] * x_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) a_elem[1] * x_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + head_prod[0] = (double) a_elem[0] * x_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) a_elem[1] * x_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem = tail_x_i[jx]; + { + head_prod[0] = (double) a_elem[0] * x_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) a_elem[1] * x_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + } + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) beta_i[0] * y_elem[0]; + d2 = (double) -beta_i[1] * y_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[0] = head_e1; + tail_tmp4[0] = tail_e1; + /* imaginary part */ + d1 = (double) beta_i[0] * y_elem[1]; + d2 = (double) beta_i[1] * y_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[1] = head_e1; + tail_tmp4[1] = tail_e1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_cgbmv2_c_s_x */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c new file mode 100644 index 00000000..45b6160a --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_cgbmv2_s_c, BLAS_CGBMV2_S_C) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const float *a, int *lda, const void *head_x, const void *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_cgbmv2_s_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c new file mode 100644 index 00000000..f539a878 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c @@ -0,0 +1,281 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_s_c"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1[2]; + float sum2[2]; + float prod[2]; + float a_elem; + float x_elem[2]; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1]; + tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0]; + } + + { + tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1]; + tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0]; + } + + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_cgbmv2_s_c */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c new file mode 100644 index 00000000..7d484f5a --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_cgbmv2_s_c_x, BLAS_CGBMV2_S_C_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const float *a, int *lda, const void *head_x, const void *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_cgbmv2_s_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c new file mode 100644 index 00000000..4a570797 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c @@ -0,0 +1,1304 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_s_c_x"; + + switch (prec) { + case blas_prec_single:{ + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1[2]; + float sum2[2]; + float prod[2]; + float a_elem; + float x_elem[2]; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1]; + tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0]; + } + + { + tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1]; + tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0]; + } + + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + + break; + } + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + float result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem; + float x_elem[2]; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + prod[0] = (double) x_elem[0] * a_elem; + prod[1] = (double) x_elem[1] * a_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = (double) x_elem[0] * a_elem; + prod[1] = (double) x_elem[1] * a_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + float result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + float a_elem; + float x_elem[2]; + float y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + head_prod[0] = (double) x_elem[0] * a_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) x_elem[1] * a_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + head_prod[0] = (double) x_elem[0] * a_elem; + tail_prod[0] = 0.0; + head_prod[1] = (double) x_elem[1] * a_elem; + tail_prod[1] = 0.0; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + } + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) beta_i[0] * y_elem[0]; + d2 = (double) -beta_i[1] * y_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[0] = head_e1; + tail_tmp4[0] = tail_e1; + /* imaginary part */ + d1 = (double) beta_i[0] * y_elem[1]; + d2 = (double) beta_i[1] * y_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[1] = head_e1; + tail_tmp4[1] = tail_e1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_cgbmv2_s_c_x */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c new file mode 100644 index 00000000..fddb2b00 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_cgbmv2_s_s, BLAS_CGBMV2_S_S) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const float *a, int *lda, const float *head_x, const float *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_cgbmv2_s_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c new file mode 100644 index 00000000..8e326907 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c @@ -0,0 +1,269 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_s_s"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1; + float sum2; + float prod; + float a_elem; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = alpha_i[0] * sum1; + tmp1[1] = alpha_i[1] * sum1; + } + { + tmp2[0] = alpha_i[0] * sum2; + tmp2[1] = alpha_i[1] * sum2; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_cgbmv2_s_s */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c new file mode 100644 index 00000000..f05811d8 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_cgbmv2_s_s_x, BLAS_CGBMV2_S_S_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const float *a, int *lda, const float *head_x, const float *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_cgbmv2_s_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c new file mode 100644 index 00000000..0c7c8770 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c @@ -0,0 +1,946 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_s_s_x"; + + switch (prec) { + case blas_prec_single:{ + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1; + float sum2; + float prod; + float a_elem; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = alpha_i[0] * sum1; + tmp1[1] = alpha_i[1] * sum1; + } + { + tmp2[0] = alpha_i[0] * sum2; + tmp2[1] = alpha_i[1] * sum2; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + + break; + } + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + float result[2]; + double sum1; + double sum2; + double prod; + float a_elem; + float x_elem; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = (double) x_elem *a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = (double) x_elem *a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = alpha_i[0] * sum1; + tmp1[1] = alpha_i[1] * sum1; + } + { + tmp2[0] = alpha_i[0] * sum2; + tmp2[1] = alpha_i[1] * sum2; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + float result[2]; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + float a_elem; + float x_elem; + float y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + double head_e1, tail_e1; + double dt; + dt = (double) alpha_i[0]; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum1 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp1[0] = head_e1; + tail_tmp1[0] = tail_e1; + dt = (double) alpha_i[1]; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum1 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp1[1] = head_e1; + tail_tmp1[1] = tail_e1; + } + { + double head_e1, tail_e1; + double dt; + dt = (double) alpha_i[0]; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum2 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp2[0] = head_e1; + tail_tmp2[0] = tail_e1; + dt = (double) alpha_i[1]; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum2 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp2[1] = head_e1; + tail_tmp2[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) beta_i[0] * y_elem[0]; + d2 = (double) -beta_i[1] * y_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[0] = head_e1; + tail_tmp4[0] = tail_e1; + /* imaginary part */ + d1 = (double) beta_i[0] * y_elem[1]; + d2 = (double) beta_i[1] * y_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[1] = head_e1; + tail_tmp4[1] = tail_e1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_cgbmv2_s_s_x */ diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c new file mode 100644 index 00000000..4f9fa6a1 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_cgbmv2_x, BLAS_CGBMV2_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy, int *prec) { + BLAS_cgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy, + (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c new file mode 100644 index 00000000..2985002f --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c @@ -0,0 +1,1688 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_cgbmv2_x"; + + switch (prec) { + case blas_prec_single:{ + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + float tmp1[2]; + float tmp2[2]; + float tmp3[2]; + float tmp4[2]; + float result[2]; + float sum1[2]; + float sum2[2]; + float prod[2]; + float a_elem[2]; + float x_elem[2]; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1]; + prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0]; + } + + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1]; + prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0]; + } + + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1]; + prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0]; + } + + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1]; + prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0]; + } + + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1]; + tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0]; + } + + { + tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1]; + tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0]; + } + + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1]; + tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0]; + } + + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + + break; + } + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + float result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + float x_elem[2]; + float y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = (float *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + float *alpha_i = (float *) alpha; + float *beta_i = (float *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + float result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + float a_elem[2]; + float x_elem[2]; + float y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + } + { + double cd[2]; + cd[0] = (double) alpha_i[0]; + cd[1] = (double) alpha_i[1]; + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a0 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a1 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + c11 = head_a1 * cd[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * cd[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + c11 = head_a0 * cd[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * cd[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) beta_i[0] * y_elem[0]; + d2 = (double) -beta_i[1] * y_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[0] = head_e1; + tail_tmp4[0] = tail_e1; + /* imaginary part */ + d1 = (double) beta_i[0] * y_elem[1]; + d2 = (double) beta_i[1] * y_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_tmp4[1] = head_e1; + tail_tmp4[1] = tail_e1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_cgbmv2_x */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c new file mode 100644 index 00000000..52ed8941 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy); + + +extern void FC_FUNC_(blas_dgbmv2_d_s, BLAS_DGBMV2_D_S) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const double *a, int *lda, const float *head_x, const float *tail_x, + int *incx, double *beta, double *y, int *incy) { + BLAS_dgbmv2_d_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c new file mode 100644 index 00000000..0c70c784 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c @@ -0,0 +1,254 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_d_s"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const double *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + double a_elem; + float x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_dgbmv2_d_s */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c new file mode 100644 index 00000000..bca44d58 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_dgbmv2_d_s_x, BLAS_DGBMV2_D_S_X) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const double *a, int *lda, const float *head_x, const float *tail_x, + int *incx, double *beta, double *y, int *incy, int *prec) { + BLAS_dgbmv2_d_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c new file mode 100644 index 00000000..bd27105e --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c @@ -0,0 +1,620 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_d_s_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const double *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + double a_elem; + float x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const double *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double head_tmp1, tail_tmp1; + double head_tmp2, tail_tmp2; + double head_tmp3, tail_tmp3; + double head_tmp4, tail_tmp4; + double result; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + double a_elem; + float x_elem; + double y_elem; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + { + double dt = (double) x_elem; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = dt * split; + a1 = con - dt; + a1 = con - a1; + a2 = dt - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = dt * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + { + double dt = (double) x_elem; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = dt * split; + a1 = con - dt; + a1 = con - a1; + a2 = dt - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = dt * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum1 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp1 = t1 + t2; + tail_tmp1 = t2 - (head_tmp1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum2 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp2 = t1 + t2; + tail_tmp2 = t2 - (head_tmp2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp1 + head_tmp2; + bv = s1 - head_tmp1; + s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp1 + tail_tmp2; + bv = t1 - tail_tmp1; + t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_tmp3 = t1 + t2; + tail_tmp3 = t2 - (head_tmp3 - t1); + } + y_elem = y_i[iy]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i * split; + a1 = con - beta_i; + a1 = con - a1; + a2 = beta_i - a1; + con = y_elem * split; + b1 = con - y_elem; + b1 = con - b1; + b2 = y_elem - b1; + + head_tmp4 = beta_i * y_elem; + tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp4 + head_tmp3; + bv = s1 - head_tmp4; + s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp4 + tail_tmp3; + bv = t1 - tail_tmp4; + t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result = t1 + t2; + } + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_dgbmv2_d_s_x */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c new file mode 100644 index 00000000..3fc22560 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy); + + +extern void FC_FUNC_(blas_dgbmv2_s_d, BLAS_DGBMV2_S_D) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const float *a, int *lda, const double *head_x, const double *tail_x, + int *incx, double *beta, double *y, int *incy) { + BLAS_dgbmv2_s_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c new file mode 100644 index 00000000..8b840787 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c @@ -0,0 +1,254 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_s_d"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + float a_elem; + double x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_dgbmv2_s_d */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c new file mode 100644 index 00000000..e02f9fd5 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_dgbmv2_s_d_x, BLAS_DGBMV2_S_D_X) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const float *a, int *lda, const double *head_x, const double *tail_x, + int *incx, double *beta, double *y, int *incy, int *prec) { + BLAS_dgbmv2_s_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c new file mode 100644 index 00000000..eda4da18 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c @@ -0,0 +1,620 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_s_d_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + float a_elem; + double x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double head_tmp1, tail_tmp1; + double head_tmp2, tail_tmp2; + double head_tmp3, tail_tmp3; + double head_tmp4, tail_tmp4; + double result; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + float a_elem; + double x_elem; + double y_elem; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + { + double dt = (double) a_elem; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + head_prod = x_elem * dt; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + { + double dt = (double) a_elem; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + head_prod = x_elem * dt; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum1 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp1 = t1 + t2; + tail_tmp1 = t2 - (head_tmp1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum2 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp2 = t1 + t2; + tail_tmp2 = t2 - (head_tmp2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp1 + head_tmp2; + bv = s1 - head_tmp1; + s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp1 + tail_tmp2; + bv = t1 - tail_tmp1; + t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_tmp3 = t1 + t2; + tail_tmp3 = t2 - (head_tmp3 - t1); + } + y_elem = y_i[iy]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i * split; + a1 = con - beta_i; + a1 = con - a1; + a2 = beta_i - a1; + con = y_elem * split; + b1 = con - y_elem; + b1 = con - b1; + b2 = y_elem - b1; + + head_tmp4 = beta_i * y_elem; + tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp4 + head_tmp3; + bv = s1 - head_tmp4; + s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp4 + tail_tmp3; + bv = t1 - tail_tmp4; + t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result = t1 + t2; + } + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_dgbmv2_s_d_x */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c new file mode 100644 index 00000000..f8d78cd8 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy); + + +extern void FC_FUNC_(blas_dgbmv2_s_s, BLAS_DGBMV2_S_S) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const float *a, int *lda, const float *head_x, const float *tail_x, + int *incx, double *beta, double *y, int *incy) { + BLAS_dgbmv2_s_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c new file mode 100644 index 00000000..fa29f394 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c @@ -0,0 +1,254 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_s_s"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + float a_elem; + float x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = (double) x_elem *a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = (double) x_elem *a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_dgbmv2_s_s */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c new file mode 100644 index 00000000..cac401bf --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_dgbmv2_s_s_x, BLAS_DGBMV2_S_S_X) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const float *a, int *lda, const float *head_x, const float *tail_x, + int *incx, double *beta, double *y, int *incy, int *prec) { + BLAS_dgbmv2_s_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c new file mode 100644 index 00000000..8eb8369e --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c @@ -0,0 +1,584 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_s_s_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + float a_elem; + float x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = (double) x_elem *a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = (double) x_elem *a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double head_tmp1, tail_tmp1; + double head_tmp2, tail_tmp2; + double head_tmp3, tail_tmp3; + double head_tmp4, tail_tmp4; + double result; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + float a_elem; + float x_elem; + double y_elem; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum1 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp1 = t1 + t2; + tail_tmp1 = t2 - (head_tmp1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum2 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp2 = t1 + t2; + tail_tmp2 = t2 - (head_tmp2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp1 + head_tmp2; + bv = s1 - head_tmp1; + s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp1 + tail_tmp2; + bv = t1 - tail_tmp1; + t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_tmp3 = t1 + t2; + tail_tmp3 = t2 - (head_tmp3 - t1); + } + y_elem = y_i[iy]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i * split; + a1 = con - beta_i; + a1 = con - a1; + a2 = beta_i - a1; + con = y_elem * split; + b1 = con - y_elem; + b1 = con - b1; + b2 = y_elem - b1; + + head_tmp4 = beta_i * y_elem; + tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp4 + head_tmp3; + bv = s1 - head_tmp4; + s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp4 + tail_tmp3; + bv = t1 - tail_tmp4; + t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result = t1 + t2; + } + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_dgbmv2_s_s_x */ diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c new file mode 100644 index 00000000..983c9585 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_dgbmv2_x, BLAS_DGBMV2_X) + + (int *trans, int *m, int *n, int *kl, int *ku, double *alpha, + const double *a, int *lda, const double *head_x, const double *tail_x, + int *incx, double *beta, double *y, int *incy, int *prec) { + BLAS_dgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, *incy, + (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c new file mode 100644 index 00000000..e66c3edf --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c @@ -0,0 +1,614 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, double alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, double beta, + double *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) double + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) double + * + * y (input) const double* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_dgbmv2_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const double *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + double result; + double sum1; + double sum2; + double prod; + double a_elem; + double x_elem; + double y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = y; + const double *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double alpha_i = alpha; + double beta_i = beta; + double head_tmp1, tail_tmp1; + double head_tmp2, tail_tmp2; + double head_tmp3, tail_tmp3; + double head_tmp4, tail_tmp4; + double result; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + double a_elem; + double x_elem; + double y_elem; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = x_elem * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = x_elem * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum1 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp1 = t1 + t2; + tail_tmp1 = t2 - (head_tmp1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i * split; + b1 = con - alpha_i; + b1 = con - b1; + b2 = alpha_i - b1; + + c11 = head_sum2 * alpha_i; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp2 = t1 + t2; + tail_tmp2 = t2 - (head_tmp2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp1 + head_tmp2; + bv = s1 - head_tmp1; + s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp1 + tail_tmp2; + bv = t1 - tail_tmp1; + t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_tmp3 = t1 + t2; + tail_tmp3 = t2 - (head_tmp3 - t1); + } + y_elem = y_i[iy]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i * split; + a1 = con - beta_i; + a1 = con - a1; + a2 = beta_i - a1; + con = y_elem * split; + b1 = con - y_elem; + b1 = con - b1; + b2 = y_elem - b1; + + head_tmp4 = beta_i * y_elem; + tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp4 + head_tmp3; + bv = s1 - head_tmp4; + s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp4 + tail_tmp3; + bv = t1 - tail_tmp4; + t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result = t1 + t2; + } + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_dgbmv2_x */ diff --git a/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c new file mode 100644 index 00000000..6454c4f5 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, float alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, float beta, + float *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_sgbmv2_x, BLAS_SGBMV2_X) + + (int *trans, int *m, int *n, int *kl, int *ku, float *alpha, const float *a, + int *lda, const float *head_x, const float *tail_x, int *incx, float *beta, + float *y, int *incy, int *prec) { + BLAS_sgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, *incy, + (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c new file mode 100644 index 00000000..c5397379 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c @@ -0,0 +1,727 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, float alpha, + const float *a, int lda, const float *head_x, + const float *tail_x, int incx, float beta, + float *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) float + * + * AB (input) float* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) float* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) float + * + * y (input) const float* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_sgbmv2_x"; + + switch (prec) { + case blas_prec_single:{ + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float alpha_i = alpha; + float beta_i = beta; + float tmp1; + float tmp2; + float tmp3; + float tmp4; + float result; + float sum1; + float sum2; + float prod; + float a_elem; + float x_elem; + float y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = beta_i * y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + + break; + } + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float alpha_i = alpha; + float beta_i = beta; + double tmp1; + double tmp2; + double tmp3; + double tmp4; + float result; + double sum1; + double sum2; + double prod; + float a_elem; + float x_elem; + float y_elem; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = (double) x_elem *a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = (double) x_elem *a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + tmp1 = sum1 * alpha_i; + tmp2 = sum2 * alpha_i; + tmp3 = tmp1 + tmp2; + y_elem = y_i[iy]; + tmp4 = (double) beta_i *y_elem; + result = tmp4 + tmp3; + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + float *y_i = y; + const float *a_i = a; + const float *head_x_i = head_x; + const float *tail_x_i = tail_x; + float alpha_i = alpha; + float beta_i = beta; + double head_tmp1, tail_tmp1; + double head_tmp2, tail_tmp2; + double head_tmp3, tail_tmp3; + double head_tmp4, tail_tmp4; + float result; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + float a_elem; + float x_elem; + float y_elem; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i == 0.0) && (beta_i == 1.0)) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + + + + + + + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + head_prod = (double) x_elem *a_elem; + tail_prod = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + double dt = (double) alpha_i; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum1 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp1 = t1 + t2; + tail_tmp1 = t2 - (head_tmp1 - t1); + } + } + { + double dt = (double) alpha_i; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = dt * split; + b1 = con - dt; + b1 = con - b1; + b2 = dt - b1; + + c11 = head_sum2 * dt; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * dt; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_tmp2 = t1 + t2; + tail_tmp2 = t2 - (head_tmp2 - t1); + } + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp1 + head_tmp2; + bv = s1 - head_tmp1; + s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp1 + tail_tmp2; + bv = t1 - tail_tmp1; + t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_tmp3 = t1 + t2; + tail_tmp3 = t2 - (head_tmp3 - t1); + } + y_elem = y_i[iy]; + head_tmp4 = (double) beta_i *y_elem; + tail_tmp4 = 0.0; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_tmp4 + head_tmp3; + bv = s1 - head_tmp4; + s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_tmp4 + tail_tmp3; + bv = t1 - tail_tmp4; + t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result = t1 + t2; + } + y_i[iy] = result; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_sgbmv2_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c new file mode 100644 index 00000000..2165e22b --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_c_c, BLAS_ZGBMV2_C_C) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy) { + BLAS_zgbmv2_c_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c new file mode 100644 index 00000000..6da002ea --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c @@ -0,0 +1,314 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_c_c"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + float x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_c_c */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c new file mode 100644 index 00000000..368eec3b --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_c_c_x, BLAS_ZGBMV2_C_C_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_c_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c new file mode 100644 index 00000000..108a3062 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c @@ -0,0 +1,1560 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_c_c_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + float x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + float a_elem[2]; + float x_elem[2]; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double head_e1, tail_e1; + double d1; + double d2; + /* Real part */ + d1 = (double) x_elem[0] * a_elem[0]; + d2 = (double) -x_elem[1] * a_elem[1]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[0] = head_e1; + tail_prod[0] = tail_e1; + /* imaginary part */ + d1 = (double) x_elem[0] * a_elem[1]; + d2 = (double) x_elem[1] * a_elem[0]; + { + /* Compute double-double = double + double. */ + double e, t1, t2; + + /* Knuth trick. */ + t1 = d1 + d2; + e = t1 - d1; + t2 = ((d2 - e) + (d1 - (t1 - e))); + + /* The result is t1 + t2, after normalization. */ + head_e1 = t1 + t2; + tail_e1 = t2 - (head_e1 - t1); + } + head_prod[1] = head_e1; + tail_prod[1] = tail_e1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_c_c_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c new file mode 100644 index 00000000..df7b7cc8 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_c_z, BLAS_ZGBMV2_C_Z) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy) { + BLAS_zgbmv2_c_z(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c new file mode 100644 index 00000000..bb4ad742 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c @@ -0,0 +1,314 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_c_z"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + double x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_c_z */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c new file mode 100644 index 00000000..b29d074d --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_c_z_x, BLAS_ZGBMV2_C_Z_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_c_z_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c new file mode 100644 index 00000000..4772c683 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c @@ -0,0 +1,1956 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_c_z_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + float a_elem[2]; + double x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const float *a_i = (float *) a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + float a_elem[2]; + double x_elem[2]; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + double cd[2]; + cd[0] = (double) a_elem[0]; + cd[1] = (double) a_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double cd[2]; + cd[0] = (double) a_elem[0]; + cd[1] = (double) a_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + double cd[2]; + cd[0] = (double) a_elem[0]; + cd[1] = (double) a_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double cd[2]; + cd[0] = (double) a_elem[0]; + cd[1] = (double) a_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = x_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = x_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_c_z_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c new file mode 100644 index 00000000..cb517912 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_d_d, BLAS_ZGBMV2_D_D) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const double *a, int *lda, const double *head_x, const double *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_zgbmv2_d_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c new file mode 100644 index 00000000..6c59b71d --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c @@ -0,0 +1,270 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_d_d"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1; + double sum2; + double prod; + double a_elem; + double x_elem; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = alpha_i[0] * sum1; + tmp1[1] = alpha_i[1] * sum1; + } + { + tmp2[0] = alpha_i[0] * sum2; + tmp2[1] = alpha_i[1] * sum2; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_d_d */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c new file mode 100644 index 00000000..44031fb6 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_d_d_x, BLAS_ZGBMV2_D_D_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const double *a, int *lda, const double *head_x, const double *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_d_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c new file mode 100644 index 00000000..99329c0e --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c @@ -0,0 +1,896 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_d_d_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1; + double sum2; + double prod; + double a_elem; + double x_elem; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1 = 0.0; + sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + prod = x_elem * a_elem; + sum1 = sum1 + prod; + x_elem = tail_x_i[jx]; + prod = x_elem * a_elem; + sum2 = sum2 + prod; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = alpha_i[0] * sum1; + tmp1[1] = alpha_i[1] * sum1; + } + { + tmp2[0] = alpha_i[0] * sum2; + tmp2[1] = alpha_i[1] * sum2; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1, tail_sum1; + double head_sum2, tail_sum2; + double head_prod, tail_prod; + double a_elem; + double x_elem; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + + + + + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1 = tail_sum1 = 0.0; + head_sum2 = tail_sum2 = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem = a_i[aij]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = x_elem * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum1 + head_prod; + bv = s1 - head_sum1; + s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum1 + tail_prod; + bv = t1 - tail_sum1; + t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum1 = t1 + t2; + tail_sum1 = t2 - (head_sum1 - t1); + } + x_elem = tail_x_i[jx]; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem * split; + b1 = con - a_elem; + b1 = con - b1; + b2 = a_elem - b1; + + head_prod = x_elem * a_elem; + tail_prod = + (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_sum2 + head_prod; + bv = s1 - head_sum2; + s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_sum2 + tail_prod; + bv = t1 - tail_sum2; + t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_sum2 = t1 + t2; + tail_sum2 = t2 - (head_sum2 - t1); + } + aij += incaij; + jx += incx; + } + + + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_sum1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp1[0] = head_t; + tail_tmp1[0] = tail_t; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum1 * split; + a11 = con - head_sum1; + a11 = con - a11; + a21 = head_sum1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_sum1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp1[1] = head_t; + tail_tmp1[1] = tail_t; + } + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_sum2 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp2[0] = head_t; + tail_tmp2[0] = tail_t; + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_sum2 * split; + a11 = con - head_sum2; + a11 = con - a11; + a21 = head_sum2 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_sum2 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_sum2 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp2[1] = head_t; + tail_tmp2[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_d_d_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c new file mode 100644 index 00000000..b034cc27 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_d_z, BLAS_ZGBMV2_D_Z) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const double *a, int *lda, const void *head_x, const void *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_zgbmv2_d_z(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c new file mode 100644 index 00000000..6066a47f --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c @@ -0,0 +1,280 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_d_z"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem; + double x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_d_z */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c new file mode 100644 index 00000000..f9bb4d1c --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_d_z_x, BLAS_ZGBMV2_D_Z_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const double *a, int *lda, const void *head_x, const void *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_d_z_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c new file mode 100644 index 00000000..996fa4dc --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c @@ -0,0 +1,1274 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const double *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) double* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_d_z_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem; + double x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = x_elem[0] * a_elem; + prod[1] = x_elem[1] * a_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + double a_elem; + double x_elem[2]; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + + + + + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem = a_i[aij]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem * split; + a1 = con - a_elem; + a1 = con - a1; + a2 = a_elem - a1; + con = x_elem[0] * split; + b1 = con - x_elem[0]; + b1 = con - b1; + b2 = x_elem[0] - b1; + + head_t = a_elem * x_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem * split; + a1 = con - a_elem; + a1 = con - a1; + a2 = a_elem - a1; + con = x_elem[1] * split; + b1 = con - x_elem[1]; + b1 = con - b1; + b2 = x_elem[1] - b1; + + head_t = a_elem * x_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem * split; + a1 = con - a_elem; + a1 = con - a1; + a2 = a_elem - a1; + con = x_elem[0] * split; + b1 = con - x_elem[0]; + b1 = con - b1; + b2 = x_elem[0] - b1; + + head_t = a_elem * x_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem * split; + a1 = con - a_elem; + a1 = con - a1; + a2 = a_elem - a1; + con = x_elem[1] * split; + b1 = con - x_elem[1]; + b1 = con - b1; + b2 = x_elem[1] - b1; + + head_t = a_elem * x_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_d_z_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c new file mode 100644 index 00000000..17af6ea7 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_x, BLAS_ZGBMV2_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy, + (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c new file mode 100644 index 00000000..98cf4658 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c @@ -0,0 +1,1936 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem[2]; + double x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const double *head_x_i = (double *) head_x; + const double *tail_x_i = (double *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + double a_elem[2]; + double x_elem[2]; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[0] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[1] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[1] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[0] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[0] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[1] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[1] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[0] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[0] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[1] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[1] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[0] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[0] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[1] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[1] * split; + a1 = con - x_elem[1]; + a1 = con - a1; + a2 = x_elem[1] - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t1 = x_elem[1] * a_elem[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem[0] * split; + a1 = con - x_elem[0]; + a1 = con - a1; + a2 = x_elem[0] - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t2 = x_elem[0] * a_elem[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c new file mode 100644 index 00000000..e51d9551 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_z_c, BLAS_ZGBMV2_Z_C) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy) { + BLAS_zgbmv2_z_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c new file mode 100644 index 00000000..6581d257 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c @@ -0,0 +1,314 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_z_c"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem[2]; + float x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_z_c */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c new file mode 100644 index 00000000..45dbc677 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_z_c_x, BLAS_ZGBMV2_Z_C_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const void *head_x, const void *tail_x, int *incx, + const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_z_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c new file mode 100644 index 00000000..5b0f1841 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c @@ -0,0 +1,1956 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const void *head_x, + const void *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) void* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_z_c_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem[2]; + float x_elem[2]; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + prod[0] = + (double) x_elem[0] * a_elem[0] - + (double) x_elem[1] * a_elem[1]; + prod[1] = + (double) x_elem[0] * a_elem[1] + + (double) x_elem[1] * a_elem[0]; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const float *head_x_i = (float *) head_x; + const float *tail_x_i = (float *) tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + double a_elem[2]; + float x_elem[2]; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + incx *= 2; + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + ix0 *= 2; + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + double cd[2]; + cd[0] = (double) x_elem[0]; + cd[1] = (double) x_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double cd[2]; + cd[0] = (double) x_elem[0]; + cd[1] = (double) x_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem[0] = head_x_i[jx]; + x_elem[1] = head_x_i[jx + 1]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + double cd[2]; + cd[0] = (double) x_elem[0]; + cd[1] = (double) x_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem[0] = tail_x_i[jx]; + x_elem[1] = tail_x_i[jx + 1]; + { + double cd[2]; + cd[0] = (double) x_elem[0]; + cd[1] = (double) x_elem[1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[0] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[1] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[0] = head_t1; + tail_prod[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[1] * split; + a1 = con - a_elem[1]; + a1 = con - a1; + a2 = a_elem[1] - a1; + con = cd[0] * split; + b1 = con - cd[0]; + b1 = con - b1; + b2 = cd[0] - b1; + + head_t1 = a_elem[1] * cd[0]; + tail_t1 = + (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = a_elem[0] * split; + a1 = con - a_elem[0]; + a1 = con - a1; + a2 = a_elem[0] - a1; + con = cd[1] * split; + b1 = con - cd[1]; + b1 = con - b1; + b2 = cd[1] - b1; + + head_t2 = a_elem[0] * cd[1]; + tail_t2 = + (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_prod[1] = head_t1; + tail_prod[1] = tail_t1; + } + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_z_c_x */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c new file mode 100644 index 00000000..9d8451e7 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c @@ -0,0 +1,18 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy); + + +extern void FC_FUNC_(blas_zgbmv2_z_d, BLAS_ZGBMV2_Z_D) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const double *head_x, const double *tail_x, + int *incx, const void *beta, void *y, int *incy) { + BLAS_zgbmv2_z_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c new file mode 100644 index 00000000..3a9e537f --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c @@ -0,0 +1,302 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_z_d"; + + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem[2]; + double x_elem; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + +} /* end BLAS_zgbmv2_z_d */ diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c new file mode 100644 index 00000000..fad23ee6 --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c @@ -0,0 +1,19 @@ + +#include "f2c-bridge.h" +#include "blas_enum.h" +void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec); + + +extern void FC_FUNC_(blas_zgbmv2_z_d_x, BLAS_ZGBMV2_Z_D_X) + + (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha, + const void *a, int *lda, const double *head_x, const double *tail_x, + int *incx, const void *beta, void *y, int *incy, int *prec) { + BLAS_zgbmv2_z_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl, + *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, + *incy, (enum blas_prec_type) *prec); +} diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c new file mode 100644 index 00000000..df3e530f --- /dev/null +++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c @@ -0,0 +1,1532 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans, + int m, int n, int kl, int ku, const void *alpha, + const void *a, int lda, const double *head_x, + const double *tail_x, int incx, const void *beta, + void *y, int incy, enum blas_prec_type prec) + +/* + * Purpose + * ======= + * + * This routines computes the matrix product: + * + * y <- alpha * op(A) * (x_head + x_tail) + beta * y + * + * where + * + * A is a m x n banded matrix + * x is a n x 1 vector + * y is a m x 1 vector + * alpha and beta are scalars + * + * Arguments + * ========= + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Transpose of AB; no trans, + * trans, or conjugate trans + * + * m (input) int + * Dimension of AB + * + * n (input) int + * Dimension of AB and the length of vector x and z + * + * kl (input) int + * Number of lower diagnols of AB + * + * ku (input) int + * Number of upper diagnols of AB + * + * alpha (input) const void* + * + * AB (input) void* + * + * lda (input) int + * Leading dimension of AB + * lda >= ku + kl + 1 + * + * head_x + * tail_x (input) double* + * + * incx (input) int + * The stride for vector x. + * + * beta (input) const void* + * + * y (input) const void* + * + * incy (input) int + * The stride for vector y. + * + * prec (input) enum blas_prec_type + * Specifies the internal precision to be used. + * = blas_prec_single: single precision. + * = blas_prec_double: double precision. + * = blas_prec_extra : anything at least 1.5 times as accurate + * than double, and wider than 80-bits. + * We use double-double in our implementation. + * + * + * LOCAL VARIABLES + * =============== + * + * As an example, these variables are described on the mxn, column + * major, banded matrix described in section 2.2.3 of the specification + * + * astart indexes first element in A where computation begins + * + * incai1 indexes first element in row where row is less than lbound + * + * incai2 indexes first element in row where row exceeds lbound + * + * lbound denotes the number of rows before first element shifts + * + * rbound denotes the columns where there is blank space + * + * ra index of the rightmost element for a given row + * + * la index of leftmost elements for a given row + * + * ra - la width of a row + * + * rbound + * la ra ____|_____ + * | | | | + * | a00 a01 * * * + * lbound -| a10 a11 a12 * * + * | a20 a21 a22 a23 * + * * a31 a32 a33 a34 + * * * a42 a43 a44 + * + * Varations on order and transpose have been implemented by modifying these + * local variables. + * + */ +{ + static const char routine_name[] = "BLAS_zgbmv2_z_d_x"; + + switch (prec) { + case blas_prec_single: + case blas_prec_double: + case blas_prec_indigenous: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double tmp1[2]; + double tmp2[2]; + double tmp3[2]; + double tmp4[2]; + double result[2]; + double sum1[2]; + double sum2[2]; + double prod[2]; + double a_elem[2]; + double x_elem; + double y_elem[2]; + + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + sum1[0] = sum1[1] = 0.0; + sum2[0] = sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum1[0] = sum1[0] + prod[0]; + sum1[1] = sum1[1] + prod[1]; + x_elem = tail_x_i[jx]; + { + prod[0] = a_elem[0] * x_elem; + prod[1] = a_elem[1] * x_elem; + } + sum2[0] = sum2[0] + prod[0]; + sum2[1] = sum2[1] + prod[1]; + aij += incaij; + jx += incx; + } + } + + { + tmp1[0] = + (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1]; + tmp1[1] = + (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0]; + } + { + tmp2[0] = + (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1]; + tmp2[1] = + (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0]; + } + tmp3[0] = tmp1[0] + tmp2[0]; + tmp3[1] = tmp1[1] + tmp2[1]; + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + tmp4[0] = + (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1]; + tmp4[1] = + (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0]; + } + result[0] = tmp4[0] + tmp3[0]; + result[1] = tmp4[1] + tmp3[1]; + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + + } + break; + case blas_prec_extra: + { + int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny; + int incaij, aij, incai1, incai2, astart, ai; + double *y_i = (double *) y; + const double *a_i = (double *) a; + const double *head_x_i = head_x; + const double *tail_x_i = tail_x; + double *alpha_i = (double *) alpha; + double *beta_i = (double *) beta; + double head_tmp1[2], tail_tmp1[2]; + double head_tmp2[2], tail_tmp2[2]; + double head_tmp3[2], tail_tmp3[2]; + double head_tmp4[2], tail_tmp4[2]; + double result[2]; + double head_sum1[2], tail_sum1[2]; + double head_sum2[2], tail_sum2[2]; + double head_prod[2], tail_prod[2]; + double a_elem[2]; + double x_elem; + double y_elem[2]; + FPU_FIX_DECL; + + if (order != blas_colmajor && order != blas_rowmajor) + BLAS_error(routine_name, -1, order, NULL); + if (trans != blas_no_trans && + trans != blas_trans && trans != blas_conj_trans) { + BLAS_error(routine_name, -2, trans, NULL); + } + if (m < 0) + BLAS_error(routine_name, -3, m, NULL); + if (n < 0) + BLAS_error(routine_name, -4, n, NULL); + if (kl < 0 || kl >= m) + BLAS_error(routine_name, -5, kl, NULL); + if (ku < 0 || ku >= n) + BLAS_error(routine_name, -6, ku, NULL); + if (lda < kl + ku + 1) + BLAS_error(routine_name, -9, lda, NULL); + if (incx == 0) + BLAS_error(routine_name, -12, incx, NULL); + if (incy == 0) + BLAS_error(routine_name, -15, incy, NULL); + + if (m == 0 || n == 0) + return; + if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0) + && ((beta_i[0] == 1.0 && beta_i[1] == 0.0))) + return; + + if (trans == blas_no_trans) { + lenx = n; + leny = m; + } else { + lenx = m; + leny = n; + } + + ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx; + iy0 = (incy > 0) ? 0 : -(leny - 1) * incy; + + FPU_FIX_START; + + /* if alpha = 0, return y = y*beta */ + if ((order == blas_colmajor) && (trans == blas_no_trans)) { + astart = ku; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else if ((order == blas_colmajor) && (trans != blas_no_trans)) { + astart = ku; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) { + astart = kl; + incai1 = lda - 1; + incai2 = lda; + incaij = 1; + lbound = kl; + rbound = n - ku - 1; + ra = ku; + } else { /* rowmajor and blas_trans */ + astart = kl; + incai1 = 1; + incai2 = lda; + incaij = lda - 1; + lbound = ku; + rbound = m - kl - 1; + ra = kl; + } + + incy *= 2; + incaij *= 2; + incai1 *= 2; + incai2 *= 2; + astart *= 2; + iy0 *= 2; + + + la = 0; + ai = astart; + iy = iy0; + for (i = 0; i < leny; i++) { + head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0; + head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0; + aij = ai; + jx = ix0; + if (trans != blas_conj_trans) { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t = x_elem * a_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t = x_elem * a_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem = tail_x_i[jx]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t = x_elem * a_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t = x_elem * a_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + + } else { + for (j = ra - la; j >= 0; j--) { + x_elem = head_x_i[jx]; + a_elem[0] = a_i[aij]; + a_elem[1] = a_i[aij + 1]; + a_elem[1] = -a_elem[1]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t = x_elem * a_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t = x_elem * a_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum1[0]; + tail_a = tail_sum1[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[0] = head_t; + tail_sum1[0] = tail_t; + /* Imaginary part */ + head_a = head_sum1[1]; + tail_a = tail_sum1[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum1[1] = head_t; + tail_sum1[1] = tail_t; + } + x_elem = tail_x_i[jx]; + { + /* Compute complex-extra = complex-double * real. */ + double head_t, tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[0] * split; + b1 = con - a_elem[0]; + b1 = con - b1; + b2 = a_elem[0] - b1; + + head_t = x_elem * a_elem[0]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[0] = head_t; + tail_prod[0] = tail_t; + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = x_elem * split; + a1 = con - x_elem; + a1 = con - a1; + a2 = x_elem - a1; + con = a_elem[1] * split; + b1 = con - a_elem[1]; + b1 = con - b1; + b2 = a_elem[1] - b1; + + head_t = x_elem * a_elem[1]; + tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_prod[1] = head_t; + tail_prod[1] = tail_t; + } + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_sum2[0]; + tail_a = tail_sum2[0]; + head_b = head_prod[0]; + tail_b = tail_prod[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[0] = head_t; + tail_sum2[0] = tail_t; + /* Imaginary part */ + head_a = head_sum2[1]; + tail_a = tail_sum2[1]; + head_b = head_prod[1]; + tail_b = tail_prod[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_sum2[1] = head_t; + tail_sum2[1] = tail_t; + } + aij += incaij; + jx += incx; + } + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum1[0]; + tail_a0 = tail_sum1[0]; + head_a1 = head_sum1[1]; + tail_a1 = tail_sum1[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[0] = head_t1; + tail_tmp1[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp1[1] = head_t1; + tail_tmp1[1] = tail_t1; + } + + { + /* Compute complex-extra = complex-extra * complex-double. */ + double head_a0, tail_a0; + double head_a1, tail_a1; + double head_t1, tail_t1; + double head_t2, tail_t2; + head_a0 = head_sum2[0]; + tail_a0 = tail_sum2[0]; + head_a1 = head_sum2[1]; + tail_a1 = tail_sum2[1]; + /* real part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a0 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a1 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[0] = head_t1; + tail_tmp2[0] = tail_t1; + /* imaginary part */ + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a1 * split; + a11 = con - head_a1; + a11 = con - a11; + a21 = head_a1 - a11; + con = alpha_i[0] * split; + b1 = con - alpha_i[0]; + b1 = con - b1; + b2 = alpha_i[0] - b1; + + c11 = head_a1 * alpha_i[0]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a1 * alpha_i[0]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + { + /* Compute double-double = double-double * double. */ + double a11, a21, b1, b2, c11, c21, c2, con, t1, t2; + + con = head_a0 * split; + a11 = con - head_a0; + a11 = con - a11; + a21 = head_a0 - a11; + con = alpha_i[1] * split; + b1 = con - alpha_i[1]; + b1 = con - b1; + b2 = alpha_i[1] - b1; + + c11 = head_a0 * alpha_i[1]; + c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2; + + c2 = tail_a0 * alpha_i[1]; + t1 = c11 + c2; + t2 = (c2 - (t1 - c11)) + c21; + + head_t2 = t1 + t2; + tail_t2 = t2 - (head_t2 - t1); + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp2[1] = head_t1; + tail_tmp2[1] = tail_t1; + } + + { + double head_t, tail_t; + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp1[0]; + tail_a = tail_tmp1[0]; + head_b = head_tmp2[0]; + tail_b = tail_tmp2[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[0] = head_t; + tail_tmp3[0] = tail_t; + /* Imaginary part */ + head_a = head_tmp1[1]; + tail_a = tail_tmp1[1]; + head_b = head_tmp2[1]; + tail_b = tail_tmp2[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t = t1 + t2; + tail_t = t2 - (head_t - t1); + } + head_tmp3[1] = head_t; + tail_tmp3[1] = tail_t; + } + y_elem[0] = y_i[iy]; + y_elem[1] = y_i[iy + 1]; + { + /* Compute complex-extra = complex-double * complex-double. */ + double head_t1, tail_t1; + double head_t2, tail_t2; + /* Real part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[0] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[1] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + head_t2 = -head_t2; + tail_t2 = -tail_t2; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[0] = head_t1; + tail_tmp4[0] = tail_t1; + /* Imaginary part */ + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[1] * split; + a1 = con - beta_i[1]; + a1 = con - a1; + a2 = beta_i[1] - a1; + con = y_elem[0] * split; + b1 = con - y_elem[0]; + b1 = con - b1; + b2 = y_elem[0] - b1; + + head_t1 = beta_i[1] * y_elem[0]; + tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double_double = double * double. */ + double a1, a2, b1, b2, con; + + con = beta_i[0] * split; + a1 = con - beta_i[0]; + a1 = con - a1; + a2 = beta_i[0] - a1; + con = y_elem[1] * split; + b1 = con - y_elem[1]; + b1 = con - b1; + b2 = y_elem[1] - b1; + + head_t2 = beta_i[0] * y_elem[1]; + tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2; + } + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_t1 + head_t2; + bv = s1 - head_t1; + s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_t1 + tail_t2; + bv = t1 - tail_t1; + t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + head_t1 = t1 + t2; + tail_t1 = t2 - (head_t1 - t1); + } + head_tmp4[1] = head_t1; + tail_tmp4[1] = tail_t1; + } + { + double head_a, tail_a; + double head_b, tail_b; + /* Real part */ + head_a = head_tmp4[0]; + tail_a = tail_tmp4[0]; + head_b = head_tmp3[0]; + tail_b = tail_tmp3[0]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[0] = t1 + t2; + } + /* Imaginary part */ + head_a = head_tmp4[1]; + tail_a = tail_tmp4[1]; + head_b = head_tmp3[1]; + tail_b = tail_tmp3[1]; + { + /* Compute double-double = double-double + double-double. */ + double bv; + double s1, s2, t1, t2; + + /* Add two hi words. */ + s1 = head_a + head_b; + bv = s1 - head_a; + s2 = ((head_b - bv) + (head_a - (s1 - bv))); + + /* Add two lo words. */ + t1 = tail_a + tail_b; + bv = t1 - tail_a; + t2 = ((tail_b - bv) + (tail_a - (t1 - bv))); + + s2 += t1; + + /* Renormalize (s1, s2) to (t1, s2) */ + t1 = s1 + s2; + s2 = s2 - (t1 - s1); + + t2 += s2; + + /* Renormalize (t1, t2) */ + result[1] = t1 + t2; + } + } + y_i[iy] = result[0]; + y_i[iy + 1] = result[1]; + + iy += incy; + if (i >= lbound) { + ix0 += incx; + ai += incai2; + la++; + } else { + ai += incai1; + } + if (i < rbound) { + ra++; + } + } + + FPU_FIX_STOP; + } + break; + } +} /* end BLAS_zgbmv2_z_d_x */ diff --git a/XBLAS/src/gbmv2/Makefile b/XBLAS/src/gbmv2/Makefile new file mode 100644 index 00000000..3138b90f --- /dev/null +++ b/XBLAS/src/gbmv2/Makefile @@ -0,0 +1,80 @@ +include ../../make.conf +include ../../$(MAKEINC) + +LIB_PATH = ../../$(OUTPUT_DIR) +SRC_PATH = . +HEADER_PATH = .. + +GBMV2_SRCS =\ + $(SRC_PATH)/BLAS_dgbmv2_d_s.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_d.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_s.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_c.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_z.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_c.c\ + $(SRC_PATH)/BLAS_cgbmv2_c_s.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_c.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_s.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_d.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_z.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_d.c\ + $(SRC_PATH)/BLAS_sgbmv2_x.c\ + $(SRC_PATH)/BLAS_dgbmv2_x.c\ + $(SRC_PATH)/BLAS_cgbmv2_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_x.c\ + $(SRC_PATH)/BLAS_dgbmv2_d_s_x.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_d_x.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_s_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_c_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_z_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_c_x.c\ + $(SRC_PATH)/BLAS_cgbmv2_c_s_x.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_c_x.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_s_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_d_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_z_x.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_d_x.c \ + \ + $(SRC_PATH)/BLAS_dgbmv2_d_s-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_d-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_s-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_c-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_z-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_c-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_c_s-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_c-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_s-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_d-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_z-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_d-f2c.c\ + $(SRC_PATH)/BLAS_sgbmv2_x-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_x-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_x-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_d_s_x-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_d_x-f2c.c\ + $(SRC_PATH)/BLAS_dgbmv2_s_s_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_c_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_z_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_c_c_x-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_c_s_x-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_c_x-f2c.c\ + $(SRC_PATH)/BLAS_cgbmv2_s_s_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_z_d_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_z_x-f2c.c\ + $(SRC_PATH)/BLAS_zgbmv2_d_d_x-f2c.c + +GBMV2_OBJS = $(GBMV2_SRCS:.c=.o) + +all: $(GBMV2_OBJS) + +lib: $(GBMV2_OBJS) + $(ARCH) $(ARCHFLAGS) $(LIB_PATH)/$(XBLASLIB) $(GBMV2_OBJS) + $(RANLIB) $(LIB_PATH)/$(XBLASLIB) + +.c.o: + $(CC) $(CFLAGS) -I$(HEADER_PATH) -c -o $@ $< + +clean: + rm -f *.o *~ *.BAK + diff --git a/XBLAS/testing/blas_extended_test.h b/XBLAS/testing/blas_extended_test.h index cab9b7d5..7f75b9bf 100644 --- a/XBLAS/testing/blas_extended_test.h +++ b/XBLAS/testing/blas_extended_test.h @@ -1071,13 +1071,110 @@ void BLAS_chemv2_c_s_testgen(int norm, enum blas_order_type order, void *y, int *seed, double *head_r_true, double *tail_r_true); -void BLAS_sge_sum_mv_testgen(int norm, enum blas_order_type order, - int m, int n, int randomize, - float *alpha, int alpha_flag, float *beta, - int beta_flag, float *a, int lda, float *b, - int ldb, float *x, int incx, - float *alpha_use_ptr, float *a_use, float *b_use, - int *seed, double *head_r_true, + +void BLAS_sgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, float *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, float *beta, + int beta_flag, float *y, int *seed, double *r_true_l, + double *r_true_t); +void BLAS_dgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, double *AB, + int lda, double *x_head, double *x_tail, + double *beta, int beta_flag, double *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_cgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, double *r_true_l, + double *r_true_t); +void BLAS_zgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, double *r_true_l, + double *r_true_t); +void BLAS_cgbmv2_s_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_cgbmv2_s_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, float *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_cgbmv2_c_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, float *x_head, float *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_zgbmv2_d_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, double *AB, + int lda, double *x_head, double *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_zgbmv2_d_z_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, double *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_zgbmv2_z_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, double *x_head, double *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_dgbmv2_s_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, + double *beta, int beta_flag, double *y, + int *seed, double *r_true_l, double *r_true_t); +void BLAS_dgbmv2_s_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, float *AB, + int lda, double *x_head, double *x_tail, + double *beta, int beta_flag, double *y, + int *seed, double *r_true_l, double *r_true_t); +void BLAS_dgbmv2_d_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, + double *AB, int lda, float *x_head, + float *x_tail, double *beta, int beta_flag, + double *y, int *seed, double *r_true_l, + double *r_true_t); +void BLAS_zgbmv2_c_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_zgbmv2_c_z_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_zgbmv2_z_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t); +void BLAS_sge_sum_mv_testgen(int norm, enum blas_order_type order, int m, + int n, int randomize, float *alpha, + int alpha_flag, float *beta, int beta_flag, + float *a, int lda, float *b, int ldb, float *x, + int incx, float *alpha_use_ptr, float *a_use, + float *b_use, int *seed, double *head_r_true, double *tail_r_true); void BLAS_dge_sum_mv_testgen(int norm, enum blas_order_type order, int m, int n, int randomize, double *alpha, diff --git a/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c b/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c new file mode 100644 index 00000000..99564ff1 --- /dev/null +++ b/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c @@ -0,0 +1,2315 @@ +#include "blas_extended.h" +#include "blas_extended_private.h" +#include "blas_extended_test.h" + + +void BLAS_sgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, float *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, float *beta, + int beta_flag, float *y, int *seed, double *r_true_l, + double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) float* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) float* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) float* + * x_tail (input/output) float* + * + * beta (input/output) float* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) float* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = x_head; + float *x_tail_i = x_tail; + float *y_i = y; + int n_fix2; + int n_mix; + int ysize; + int i; + + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + float y_elem; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_sdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, &y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem; + + + /* copy a_vec to AB */ + sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_dgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, double *AB, + int lda, double *x_head, double *x_tail, + double *beta, int beta_flag, double *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) double* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) double* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) double* + * x_tail (input/output) double* + * + * beta (input/output) double* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) double* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = x_head; + double *x_tail_i = x_tail; + double *y_i = y; + int n_fix2; + int n_mix; + int ysize; + int i; + + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_ddot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, &y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem; + + + /* copy a_vec to AB */ + dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_cgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, double *r_true_l, + double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = (float *) x_head; + float *x_tail_i = (float *) x_tail; + float *y_i = (float *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + float y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_cdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, double *r_true_l, + double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = (double *) x_head; + double *x_tail_i = (double *) x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_cgbmv2_s_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) float* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) float* + * x_tail (input/output) float* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = x_head; + float *x_tail_i = x_tail; + float *y_i = (float *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + float y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_cdot2_s_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + + /* copy a_vec to AB */ + sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_cgbmv2_s_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, float *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) float* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = (float *) x_head; + float *x_tail_i = (float *) x_tail; + float *y_i = (float *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + float y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_cdot2_c_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + + /* copy a_vec to AB */ + sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_cgbmv2_c_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, float *x_head, float *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) float* + * x_tail (input/output) float* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = x_head; + float *x_tail_i = x_tail; + float *y_i = (float *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + float y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_cdot2_s_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_d_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, double *AB, + int lda, double *x_head, double *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) double* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) double* + * x_tail (input/output) double* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = x_head; + double *x_tail_i = x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_d_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + + /* copy a_vec to AB */ + dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_d_z_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, double *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) double* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = (double *) x_head; + double *x_tail_i = (double *) x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_z_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + + /* copy a_vec to AB */ + dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_z_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, double *x_head, double *x_tail, + void *beta, int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) double* + * x_tail (input/output) double* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = x_head; + double *x_tail_i = x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_d_z_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_dgbmv2_s_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, float *AB, + int lda, float *x_head, float *x_tail, + double *beta, int beta_flag, double *y, + int *seed, double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) double* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) float* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) float* + * x_tail (input/output) float* + * + * beta (input/output) double* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) double* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = x_head; + float *x_tail_i = x_tail; + double *y_i = y; + int n_fix2; + int n_mix; + int ysize; + int i; + + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_ddot2_s_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, &y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem; + + + /* copy a_vec to AB */ + sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_dgbmv2_s_d_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, float *AB, + int lda, double *x_head, double *x_tail, + double *beta, int beta_flag, double *y, + int *seed, double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) double* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) float* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) double* + * x_tail (input/output) double* + * + * beta (input/output) double* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) double* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = x_head; + double *x_tail_i = x_tail; + double *y_i = y; + int n_fix2; + int n_mix; + int ysize; + int i; + + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_ddot2_d_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, &y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem; + + + /* copy a_vec to AB */ + sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_dgbmv2_d_s_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, double *alpha, int alpha_flag, + double *AB, int lda, float *x_head, + float *x_tail, double *beta, int beta_flag, + double *y, int *seed, double *r_true_l, + double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) double* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) double* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) float* + * x_tail (input/output) float* + * + * beta (input/output) double* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) double* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = x_head; + float *x_tail_i = x_tail; + double *y_i = y; + int n_fix2; + int n_mix; + int ysize; + int i; + + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + + + + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_ddot2_s_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, &y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem; + + + /* copy a_vec to AB */ + dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_tail_i[i * incx] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_c_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = (float *) x_head; + float *x_tail_i = (float *) x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_c_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_c_z_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + double *x_head_i = (double *) x_head; + double *x_tail_i = (double *) x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + float *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_z_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} +void BLAS_zgbmv2_z_c_testgen(int norm, enum blas_order_type order, + enum blas_trans_type trans, int m, int n, int kl, + int ku, void *alpha, int alpha_flag, void *AB, + int lda, void *x_head, void *x_tail, void *beta, + int beta_flag, void *y, int *seed, + double *r_true_l, double *r_true_t) + +/* + * Purpose + * ======= + * + * Generates alpha, AB, x, beta, and y, where AB is a banded + * matrix; and computes r_true. + * + * Arguments + * ========= + * + * norm (input) blas_norm_type + * + * order (input) blas_order_type + * Order of AB; row or column major + * + * trans (input) blas_trans_type + * Whether AB is no trans, trans, or conj trans + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * kl (input) int + * The number of subdiagonals + * + * ku (input) int + * The number of superdiagonals + * + * alpha (input/output) void* + * If alpha_flag = 1, alpha is input. + * If alpha_flag = 0, alpha is output. + * + * alpha_flag (input) int + * = 0 : alpha is free, and is output. + * = 1 : alpha is fixed on input. + * + * AB (output) void* + * Matrix A in the banded storage. + * + * + * lda (input) int + * The first dimension of AB + * + * x_head (input/output) void* + * x_tail (input/output) void* + * + * beta (input/output) void* + * If beta_flag = 1, beta is input. + * If beta_flag = 0, beta is output. + * + * beta_flag (input) int + * = 0 : beta is free, and is output. + * = 1 : beta is fixed on input. + * + * y (input/output) void* + * + * seed (input/output) int + * + * r_true_l (output) double* + * The leading part of the truth in double-double. + * + * r_true_t (output) double* + * The trailing part of the truth in double-double. + * + */ +{ + float *x_head_i = (float *) x_head; + float *x_tail_i = (float *) x_tail; + double *y_i = (double *) y; + int n_fix2; + int n_mix; + int ysize; + int i; + int j; + double *a_vec; + int m_i, n_i; + int max_mn; + int incy, incAB, incx; + double y_elem[2]; + + max_mn = MAX(m, n); + incx = incy = incAB = 1; + incy *= 2; + incAB *= 2; + incx *= 2; + + if (trans == blas_no_trans) { + m_i = m; + n_i = n; + } else { + m_i = n; + n_i = m; + } + + a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && a_vec == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* calling dot_testgen n time. in each iteration, one row of AB, and one + element of y are produced. the vector x is produced at the first + iteration only */ + for (i = 0; i < m_i; i++) { + /* copy AB to a_vec */ + zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i, + &n_fix2, &n_mix, &ysize); + + if (i == 1) { + /* from now on, fix alpha and beta */ + alpha_flag = 1; + beta_flag = 1; + } + + BLAS_zdot2_c_z_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha, + alpha_flag, beta, beta_flag, x_head, x_tail, a_vec, + seed, y_elem, &r_true_l[i * incy], + &r_true_t[i * incy]); + y_i[i * incy] = y_elem[0]; + y_i[i * incy + 1] = y_elem[1]; + + if (trans == blas_conj_trans) { + for (j = 0; j < n_i * incAB; j += 2) { + a_vec[j + 1] = -a_vec[j + 1]; + } + } + /* copy a_vec to AB */ + zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i); + } + + /* Zero out trailing part of x */ + for (i = ysize; i < n_i; i++) { + x_head_i[i * incx] = 0.0; + x_head_i[i * incx + 1] = 0.0; + x_tail_i[i * incx] = 0.0; + x_tail_i[i * incx + 1] = 0.0; + } + + blas_free(a_vec); +} diff --git a/XBLAS/testing/test-gbmv2/Makefile b/XBLAS/testing/test-gbmv2/Makefile new file mode 100644 index 00000000..533f0f49 --- /dev/null +++ b/XBLAS/testing/test-gbmv2/Makefile @@ -0,0 +1,43 @@ +include ../../make.conf +include ../../$(MAKEINC) + +TEST_DOT_PATH = ../test-dot +TEST_DOT2_PATH = ../test-dot2 +TEST_GBMV_PATH=../test-gbmv +LIB_PATH = ../../$(OUTPUT_DIR) +HEADER_PATH = -I../../src -I.. + +TEST_DOT_OBJS = $(TEST_DOT_PATH)/testgen_aux.o \ + $(TEST_DOT_PATH)/print_vector.o \ + $(TEST_DOT_PATH)/copy_vector.o +TEST_DOT2_OBJS = $(TEST_DOT2_PATH)/dot2.o \ + $(TEST_DOT2_PATH)/testgen_BLAS_sdot2.o \ + $(TEST_DOT2_PATH)/testgen_BLAS_ddot2.o \ + $(TEST_DOT2_PATH)/testgen_BLAS_cdot2.o \ + $(TEST_DOT2_PATH)/testgen_BLAS_zdot2.o \ + $(TEST_DOT2_PATH)/test_dot2.o \ + $(TEST_DOT2_PATH)/r_truth2.o \ + $(TEST_DOT2_PATH)/BLAS_dot2_testgen.o +TEST_GBMV_OBJS=../test-gbmv/gbmv-support.o + +TEST_SRCS = do_test_gbmv2.c BLAS_gbmv2_testgen.c +TEST_OBJS = $(TEST_SRCS:.c=.o) +COMMON_OBJS = ../common/dummy_main.o + +ALL_OBJS = $(TEST_OBJS) $(TEST_DOT_OBJS) $(TEST_DOT2_OBJS) \ + $(TEST_GBMV_OBJS) $(COMMON_OBJS) + +test: do_test_gbmv2 + @echo Testing GBMV2 + ./do_test_gbmv2 12 1 1.0 0 0.01 > gbmv2.results + +do_test_gbmv2: $(ALL_OBJS) $(LIB_PATH)/$(XBLASLIB) + $(LINKER) $(LDFLAGS) $(ALL_OBJS) $(LIB_PATH)/$(XBLASLIB) \ + -o do_test_gbmv2 $(EXTRA_LIBS) + +.c.o: + $(CC) $(CFLAGS) $(HEADER_PATH) -c -o $@ $< + +clean: + rm -f *.o *~ *.BAK gbmv2.results do_test_gbmv2 core + diff --git a/XBLAS/testing/test-gbmv2/do_test_gbmv2.c b/XBLAS/testing/test-gbmv2/do_test_gbmv2.c new file mode 100644 index 00000000..bc76502d --- /dev/null +++ b/XBLAS/testing/test-gbmv2/do_test_gbmv2.c @@ -0,0 +1,18419 @@ +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include "blas_extended.h" +#include "blas_extended_private.h" +#include "blas_extended_test.h" + + +double do_test_dgbmv2_d_s(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_d_s"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + double *AB; + float *x_head; + float *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_d_s_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_d_s */ + FPU_FIX_STOP; + BLAS_dgbmv2_d_s(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_d_s(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_s_d(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_s_d"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + float *AB; + double *x_head; + double *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_s_d_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_s_d */ + FPU_FIX_STOP; + BLAS_dgbmv2_s_d(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_s_d(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_s_s(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_s_s"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + float *AB; + float *x_head; + float *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_s_s_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_s_s */ + FPU_FIX_STOP; + BLAS_dgbmv2_s_s(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_s_s(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_z_c(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_z_c"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + float *x_head; + float *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_z_c_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_z_c */ + FPU_FIX_STOP; + BLAS_zgbmv2_z_c(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_z_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + zprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_c_z(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_c_z"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + float *AB; + double *x_head; + double *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_c_z_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_c_z */ + FPU_FIX_STOP; + BLAS_zgbmv2_c_z(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_c_z(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_c_c(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_c_c"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + float *AB; + float *x_head; + float *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_c_c_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_c_c */ + FPU_FIX_STOP; + BLAS_zgbmv2_c_c(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_c_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_c_s(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_c_s"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_c_s_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_c_s */ + FPU_FIX_STOP; + BLAS_cgbmv2_c_s(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_c_s(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_s_c(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_s_c"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_s_c_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_s_c */ + FPU_FIX_STOP; + BLAS_cgbmv2_s_c(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_s_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_s_s(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_s_s"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_s_s_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_s_s */ + FPU_FIX_STOP; + BLAS_cgbmv2_s_s(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_s_s(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_z_d(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_z_d"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_z_d_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_z_d */ + FPU_FIX_STOP; + BLAS_zgbmv2_z_d(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_z_d(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + zprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_d_z(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_d_z"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_d_z_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_d_z */ + FPU_FIX_STOP; + BLAS_zgbmv2_d_z(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_d_z(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_d_d(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_d_d"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_d_d_testgen(norm, order_type, trans_type, + m, n, kl, ku, &alpha, + alpha_flag, AB, lda, + x_head_gen, x_tail_gen, &beta, + beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_d_d */ + FPU_FIX_STOP; + BLAS_zgbmv2_d_d(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_d_d(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_sgbmv2_x(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_sgbmv2_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha; + float beta; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_sgbmv2_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_sgbmv2_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_sgbmv2_testgen(norm, order_type, trans_type, m, + n, kl, ku, &alpha, alpha_flag, + AB, lda, x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + scopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_sgbmv2_x */ + FPU_FIX_STOP; + BLAS_sgbmv2_x(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_sdot2(n_i, blas_no_conj, alpha, beta, + y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], temp, + 1, x_head, x_tail, incx_val, + eps_int, un_int, &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%16.8e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%16.8e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%16.8e", alpha); + printf("\n "); + printf("beta = "); + printf("%16.8e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_x(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_testgen(norm, order_type, trans_type, m, + n, kl, ku, &alpha, alpha_flag, + AB, lda, x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_x */ + FPU_FIX_STOP; + BLAS_dgbmv2_x(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2(n_i, blas_no_conj, alpha, beta, + y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], temp, + 1, x_head, x_tail, incx_val, + eps_int, un_int, &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_x(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_testgen(norm, order_type, trans_type, m, + n, kl, ku, &alpha, alpha_flag, + AB, lda, x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_x */ + FPU_FIX_STOP; + BLAS_cgbmv2_x(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2(n_i, blas_no_conj, alpha, beta, + &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_x(int m, int n, int ntests, int *seed, double thresh, + int debug, float test_prob, double *min_ratio, + int *num_bad_ratio, int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_testgen(norm, order_type, trans_type, m, + n, kl, ku, &alpha, alpha_flag, + AB, lda, x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, seed, + head_r_true, tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_x(order_type, trans_type, m, n, kl, + ku, alpha, AB, lda, x_head, x_tail, + incx_val, beta, y, incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2(n_i, blas_no_conj, alpha, beta, + &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + zprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_d_s_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_d_s_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + double *AB; + float *x_head; + float *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_d_s_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_d_s_x */ + FPU_FIX_STOP; + BLAS_dgbmv2_d_s_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_d_s(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_s_d_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_s_d_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + float *AB; + double *x_head; + double *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_s_d_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_s_d_x */ + FPU_FIX_STOP; + BLAS_dgbmv2_s_d_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_s_d(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_dgbmv2_s_s_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_dgbmv2_s_s_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha; + double beta; + float *AB; + float *x_head; + float *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha = 0.0; + alpha_flag = 1; + break; + case 1: + alpha = 1.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta = 0.0; + beta_flag = 1; + break; + case 1: + beta = 1.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_dgbmv2_s_s_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + + + dcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_dgbmv2_s_s_x */ + FPU_FIX_STOP; + BLAS_dgbmv2_s_s_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_ddot2_s_s(n_i, blas_no_conj, alpha, + beta, y_gen[j * incy_gen], + y[iy], + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("%24.16e", y_gen[k * incy_gen]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("%24.16e", y[iy]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("%24.16e", alpha); + printf("\n "); + printf("beta = "); + printf("%24.16e", beta); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf("[%24.16e, %24.16e]", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_z_c_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_z_c_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + float *x_head; + float *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_z_c_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_z_c_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_z_c_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_z_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + zprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_c_z_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_c_z_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + float *AB; + double *x_head; + double *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_c_z_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_c_z_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_c_z_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_c_z(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_c_c_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_c_c_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + float *AB; + float *x_head; + float *x_tail; + double *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_c_c_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_c_c_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_c_c_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_c_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_c_s_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_c_s_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_c_s_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_c_s_x */ + FPU_FIX_STOP; + BLAS_cgbmv2_c_s_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_c_s(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + cgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + cprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_s_c_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_s_c_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_s_c_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_s_c_x */ + FPU_FIX_STOP; + BLAS_cgbmv2_s_c_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_s_c(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%16.8e, %16.8e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_cgbmv2_s_s_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_cgbmv2_s_s_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + float alpha[2]; + float beta[2]; + float *AB; + float *x_head; + float *x_tail; + float *y; + float *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + float *x_head_gen; + float *x_tail_gen; + float *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (float *) blas_malloc(max_mn * sizeof(float)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_S); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_single)); + prec = blas_prec_single; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_cgbmv2_s_s_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + scopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + ccopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_cgbmv2_s_s_x */ + FPU_FIX_STOP; + BLAS_cgbmv2_s_s_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_cdot2_s_s(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + sgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + sprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%16.8e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%16.8e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%16.8e, %16.8e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%16.8e, %16.8e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%16.8e, %16.8e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%16.8e, %16.8e)", beta[0], beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_z_d_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_z_d_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) * + 2); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_z_d_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_z_d_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_z_d_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_z_d(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + zgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + zprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_d_z_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_d_z_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + incx_gen *= 2; + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_d_z_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + incx *= 2; + + zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_d_z_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_d_z_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_d_z(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_head[ix], + x_head[ix + 1]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("(%24.16e, %24.16e)", x_tail[ix], + x_tail[ix + 1]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} +double do_test_zgbmv2_d_d_x(int m, int n, int ntests, int *seed, + double thresh, int debug, float test_prob, + double *min_ratio, int *num_bad_ratio, + int *num_tests) + +/* + * Purpose + * ======= + * + * Runs a series of tests on GBMV2. + * + * Arguments + * ========= + * + * m (input) int + * The number of rows + * + * n (input) int + * The number of columns + * + * ntests (input) int + * The number of tests to run for each set of attributes. + * + * seed (input/output) int + * The seed for the random number generator used in testgen(). + * + * thresh (input) double + * When the ratio returned from test() exceeds the specified + * threshold, the current size, r_true, r_comp, and ratio will be + * printed. (Since ratio is supposed to be O(1), we can set thresh + * to ~10.) + * + * debug (input) int + * If debug=3, print summary + * If debug=2, print summary only if the number of bad ratios > 0 + * If debug=1, print complete info if tests fail + * If debug=0, return max ratio + * + * test_prob (input) float + * The specified test will be performed only if the generated + * random exceeds this threshold. + * + * min_ratio (output) double + * The minimum ratio + * + * num_bad_ratio (output) int + * The number of tests fail; they are above the threshold. + * + * num_tests (output) int + * The number of tests is being performed. + * + * Return value + * ============ + * + * The maximum ratio if run successfully, otherwise return -1 + * + * Code structure + * ============== + * + * debug loop -- if debug is one, the first loop computes the max ratio + * -- and the last(second) loop outputs debugging information, + * -- if the test fail and its ratio > 0.5 * max ratio. + * -- if debug is zero, the loop is executed once + * alpha loop -- varying alpha: 0, 1, or random + * beta loop -- varying beta: 0, 1, or random + * prec loop -- varying internal prec: single, double, or extra + * norm loop -- varying norm: near undeflow, near one, or + * -- near overflow + * numtest loop -- how many times the test is perform with + * -- above set of attributes + * order loop -- varying order type: rowmajor or colmajor + * trans loop -- varying trans type: no trans, trans, or conj trans + * ku loop -- varying ku: 0 to n-1 + * kl loop -- varying kl: 0 to m-1 + * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1) + * incx loop -- varying incx: -2, -1, 1, 2 + * incy loop -- varying incy: -2, -1, 1, 2 + */ +{ + /* function name */ + const char fname[] = "BLAS_zgbmv2_d_d_x"; + + /* max number of debug lines to print */ + const int max_print = 8; + + /* Variables in the "x_val" form are loop vars for corresponding + variables */ + int i; /* iterate through the repeating tests */ + int j; + int k; /* multipurpose counters or variables */ + int ix, iy; /* use to index x and y respectively */ + int incx_val, incy_val, /* for testing different inc values */ + incx, incy; + int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */ + int d_count; /* counter for debug */ + int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */ + int p_count; /* counter for the number of debug lines printed */ + int tot_tests; /* total number of tests to be done */ + int norm; /* input values of near underflow/one/overflow */ + double ratio_max; /* the current maximum ratio */ + double ratio_min; /* the current minimum ratio */ + double *ratios; /* a temporary variable for calculating ratio */ + double ratio; /* the per-use test ratio from test() */ + int bad_ratios = 0; /* the number of ratios over the threshold */ + double eps_int; /* the internal epsilon expected--2^(-24) for float */ + double un_int; /* the internal underflow threshold */ + double alpha[2]; + double beta[2]; + double *AB; + double *x_head; + double *x_tail; + double *y; + double *temp; /* use for calculating ratio */ + + /* x_gen and y_gen are used to store vectors generated by testgen. + they eventually are copied back to x and y */ + double *x_head_gen; + double *x_tail_gen; + double *y_gen; + + /* the true r calculated by testgen(), in double-double */ + double *head_r_true, *tail_r_true; + + int alpha_val; + int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */ + int beta_val; + int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */ + int order_val; + enum blas_order_type order_type = 0; + int prec_val; + enum blas_prec_type prec = 0; + int trans_val; + enum blas_trans_type trans_type = 0; + int m_i = 0; + int n_i = 0; + int max_mn; /* the max of m and n */ + int ku; + int kl; + int lda_val; + int lda = 0; + int saved_seed; /* for saving the original seed */ + + /* use for counting the number of testgen calls * 2 */ + int count, old_count = -1; + + FPU_FIX_DECL; + + /* test for bad arguments */ + if (n < 0 || m < 0 || ntests < 0) + BLAS_error(fname, 0, 0, NULL); + + /* initialization */ + *min_ratio = 0.0; + *num_bad_ratio = 0; + *num_tests = 0; + + saved_seed = *seed; + ratio_min = 1e308; + ratio_max = 0.0; + ratio = 0.0; + tot_tests = 0; + p_count = 0; + count = 0; + find_max_ratio = 0; + if (debug == 3) + find_max_ratio = 1; + max_mn = MAX(m, n); + + if (m == 0 || n == 0) { + return 0.0; + } + + FPU_FIX_START; + + incx_gen = incy_gen = 1; + + incy_gen *= 2; + + /* get space for calculation */ + x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_head == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double)); + if (max_mn * 2 > 0 && x_tail == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2); + if (max_mn * 2 > 0 && y == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_head_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_head_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && x_tail_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && y_gen == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + temp = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && temp == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2); + if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + ratios = (double *) blas_malloc(max_mn * sizeof(double)); + if (max_mn > 0 && ratios == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + AB = + (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double)); + if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) { + BLAS_error("blas_malloc", 0, 0, "malloc failed.\n"); + } + + /* The debug iteration: + If debug=1, then will execute the iteration twice. First, compute the + max ratio. Second, print info if ratio > (50% * ratio_max). */ + for (d_count = 0; d_count <= find_max_ratio; d_count++) { + bad_ratios = 0; /* set to zero */ + + if ((debug == 3) && (d_count == find_max_ratio)) + *seed = saved_seed; /* restore the original seed */ + + /* varying alpha */ + for (alpha_val = 0; alpha_val < 3; alpha_val++) { + alpha_flag = 0; + switch (alpha_val) { + case 0: + alpha[0] = alpha[1] = 0.0; + alpha_flag = 1; + break; + case 1: + alpha[0] = 1.0; + alpha[1] = 0.0; + alpha_flag = 1; + break; + } + + /* varying beta */ + for (beta_val = 0; beta_val < 3; beta_val++) { + beta_flag = 0; + switch (beta_val) { + case 0: + beta[0] = beta[1] = 0.0; + beta_flag = 1; + break; + case 1: + beta[0] = 1.0; + beta[1] = 0.0; + beta_flag = 1; + break; + } + + + /* varying extra precs */ + for (prec_val = 0; prec_val <= 2; prec_val++) { + switch (prec_val) { + case 0: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 1: + eps_int = power(2, -BITS_D); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_double)); + prec = blas_prec_double; + break; + case 2: + default: + eps_int = power(2, -BITS_E); + un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra), + (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra)); + prec = blas_prec_extra; + break; + } + + /* values near underflow, 1, or overflow */ + for (norm = -1; norm <= 1; norm++) { + + /* number of tests */ + for (i = 0; i < ntests; i++) { + + /* row or col major */ + for (order_val = 0; order_val < 2; order_val++) { + switch (order_val) { + case 0: + order_type = blas_rowmajor; + break; + case 1: + order_type = blas_colmajor; + break; + } + + /* no_trans, trans, or conj_trans */ + for (trans_val = 0; trans_val < 3; trans_val++) { + switch (trans_val) { + case 0: + trans_type = blas_no_trans; + m_i = m; + n_i = n; + break; + case 1: + trans_type = blas_trans; + m_i = n; + n_i = m; + break; + case 2: + trans_type = blas_conj_trans; + m_i = n; + n_i = m; + break; + } + + /* ku from 0 to n-1 */ + for (ku = 0; ku < n; ku++) { + if (ku == n && ku != 0) + continue; /* the purpose of doing this is + to test for ku=0 */ + + /* kl from 0 to m-1 */ + for (kl = 0; kl < m; kl++) { + if (kl == n && kl != 0) + continue; /* the purpose of doing this is + to test for kl=0 */ + + /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */ + for (lda_val = 0; lda_val < 3; lda_val++) { + switch (lda_val) { + case 0: + lda = ku + kl + 1; + break; + case 1: + lda = ku + kl + 2; + break; + case 2: + lda = 2 * (ku + kl + 1); + break; + } + + if ((order_type == blas_rowmajor && lda < n) || + (order_type == blas_colmajor && lda < m)) + continue; + + /* For the sake of speed, we throw out this case at random */ + if (xrand(seed) >= test_prob) + continue; + + /* in the trivial cases, no need to run testgen */ + if (m > 0 && n > 0) + BLAS_zgbmv2_d_d_testgen(norm, order_type, + trans_type, m, n, kl, ku, + &alpha, alpha_flag, AB, lda, + x_head_gen, x_tail_gen, + &beta, beta_flag, y_gen, + seed, head_r_true, + tail_r_true); + count++; + + /* varying incx */ + for (incx_val = -2; incx_val <= 2; incx_val++) { + if (incx_val == 0) + continue; + + /* setting incx */ + incx = incx_val; + + + dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val); + dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val); + + /* varying incy */ + for (incy_val = -2; incy_val <= 2; incy_val++) { + if (incy_val == 0) + continue; + + /* setting incy */ + incy = incy_val; + incy *= 2; + + zcopy_vector(y_gen, m_i, 1, y, incy_val); + + /* call BLAS_zgbmv2_d_d_x */ + FPU_FIX_STOP; + BLAS_zgbmv2_d_d_x(order_type, trans_type, m, n, + kl, ku, alpha, AB, lda, x_head, + x_tail, incx_val, beta, y, + incy_val, prec); + FPU_FIX_START; + + /* set y starting index */ + iy = 0; + if (incy < 0) + iy = -(m_i - 1) * incy; + + /* computing the ratio */ + for (j = 0; j < m_i; j++) { + /* copy row j of AB to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, ku, + AB, lda, temp, j); + + test_BLAS_zdot2_d_d(n_i, blas_no_conj, alpha, + beta, &y_gen[j * incy_gen], + &y[iy], + &head_r_true[j * incy_gen], + &tail_r_true[j * incy_gen], + temp, 1, x_head, x_tail, + incx_val, eps_int, un_int, + &ratios[j]); + + /* take the max ratio */ + if (j == 0) { + ratio = ratios[0]; + /* The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + } else if (!(ratios[j] <= ratio)) { + ratio = ratios[j]; + } + + iy += incy; + } + + /* Increase the number of bad ratio, if the ratio + is bigger than the threshold. + The !<= below causes NaN error to be detected. + Note that (NaN > thresh) is always false. */ + if (!(ratio <= thresh)) { + bad_ratios++; + + if ((debug == 3) && /* print only when debug is on */ + (count != old_count) && /* print if old vector is different + from the current one */ + (d_count == find_max_ratio) && + (p_count <= max_print) && + (ratio > 0.5 * ratio_max)) { + old_count = count; + printf + ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n", + fname, m, n, ntests, thresh); + + /* Print test info */ + switch (prec) { + case blas_prec_single: + printf("single "); + break; + case blas_prec_double: + printf("double "); + break; + case blas_prec_indigenous: + printf("indigenous "); + break; + case blas_prec_extra: + printf("extra "); + break; + } + switch (norm) { + case -1: + printf("near_underflow "); + break; + case 0: + printf("near_one "); + break; + case 1: + printf("near_overflow "); + break; + } + switch (order_type) { + case blas_rowmajor: + printf("row_major "); + break; + case blas_colmajor: + printf("col_major "); + break; + } + switch (trans_type) { + case blas_no_trans: + printf("no_trans "); + break; + case blas_trans: + printf("trans "); + break; + case blas_conj_trans: + printf("conj_trans "); + break; + } + + printf + ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n", + ku, kl, lda, incx, incy); + + ix = 0; + iy = 0; + if (incx < 0) + ix = -(n_i - 1) * incx; + if (incy < 0) + iy = -(m_i - 1) * incy; + + printf(" A="); + for (j = 0; j < m_i; j++) { + /* copy row j of A to temp */ + dgbmv_copy(order_type, trans_type, m, n, kl, + ku, AB, lda, temp, j); + + if (j > 0) + printf(" "); + dprint_vector(temp, n_i, 1, NULL); + } + + for (j = 0, k = 0; j < n_i || k < m_i; + j++, k++) { + if (j < n_i) { + printf(" "); + printf("%24.16e", x_head[ix]); + printf("\n"); + } + if (j < n_i) { + printf(" "); + printf("%24.16e", x_tail[ix]); + printf("\n"); + } + if (k < m_i) { + printf(" "); + printf("(%24.16e, %24.16e)", + y_gen[k * incy_gen], + y_gen[k * incy_gen + 1]); + printf("\n"); + printf(" "); + printf("y_final[%d] = ", iy); + printf("(%24.16e, %24.16e)", y[iy], + y[iy + 1]); + printf("\n"); + } + ix += incx; + iy += incy; + } + + printf(" "); + printf("alpha = "); + printf("(%24.16e, %24.16e)", alpha[0], + alpha[1]); + printf("\n "); + printf("beta = "); + printf("(%24.16e, %24.16e)", beta[0], + beta[1]); + printf("\n"); + for (j = 0; j < m_i; j++) { + printf(" "); + printf + ("([%24.16e %24.16e], [%24.16e %24.16e])", + head_r_true[j * incy_gen], + tail_r_true[j * incy_gen], + head_r_true[j * incy_gen + 1], + tail_r_true[j * incy_gen + 1]); + printf(", ratio[%d]=%.4e\n", j, ratios[j]); + } + + printf(" ratio=%.4e\n", ratio); + p_count++; + } + if (bad_ratios >= MAX_BAD_TESTS) { + printf("\ntoo many failures, exiting...."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) { + printf("\nFlagrant ratio error, exiting..."); + printf("\nTesting and compilation"); + printf(" are incomplete\n\n"); + goto end; + } + } + if (d_count == 0) { + if (ratio > ratio_max) + ratio_max = ratio; + + if (ratio != 0.0 && ratio < ratio_min) + ratio_min = ratio; + tot_tests++; + } + } /* incy */ + } /* incx */ + } /* lda */ + } /* kl */ + } /* ku */ + } /* trans */ + } /* order */ + } /* tests */ + } /* norm */ + } /* prec */ + } /* beta */ + } /* alpha */ + } /* debug */ + + if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) { + printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname, + m, n, ntests, thresh); + printf + (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n", + bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests), + ratio_min, ratio_max); + } + +end: + blas_free(x_head); + blas_free(x_tail); + blas_free(y); + blas_free(x_head_gen); + blas_free(x_tail_gen); + blas_free(y_gen); + blas_free(temp); + blas_free(AB); + blas_free(head_r_true); + blas_free(tail_r_true); + blas_free(ratios); + + FPU_FIX_STOP; + + *min_ratio = ratio_min; + *num_bad_ratio = bad_ratios; + *num_tests = tot_tests; + return ratio_max; +} + +#define NUMPAIRS 12 + +int main(int argc, char **argv) +{ + int nsizes, ntests, debug; + double thresh, test_prob; + double total_min_ratio, total_max_ratio; + int total_bad_ratios; + int seed, num_bad_ratio, num_tests; + int total_tests, nr_failed_routines = 0, nr_routines = 0; + double min_ratio, max_ratio; + const char *base_routine = "gbmv2"; + char *fname; + int n; + + int m, i; + int mn_pairs[NUMPAIRS][2] = + { {0, 0}, {1, 0}, {0, 1}, {1, 2}, {2, 1}, {1, 3}, + {3, 1}, {2, 3}, {4, 3}, {2, 4}, {6, 6}, {10, 8} + }; + + if (argc != 6) { + printf("Usage:\n"); + printf("do_test_gbmv2 <nsizes> <ntests> <thresh> <debug> <test_prob>\n"); + printf(" <nsizes>: number of sizes to be run.\n"); + printf + (" <ntests>: the number of tests performed for each set of attributes\n"); + printf + (" <thresh>: to catch bad ratios if it is greater than <thresh>\n"); + printf(" <debug>: 0, 1, 2, or 3; \n"); + printf(" if 0, no printing \n"); + printf(" if 1, print error summary only if tests fail\n"); + printf(" if 2, print error summary for each n\n"); + printf(" if 3, print complete info each test fails \n"); + printf("<test_prob>: probability of preforming a given \n"); + printf(" test case: 0.0 does no tests, 1.0 does all tests\n"); + return -1; + } else { + nsizes = atoi(argv[1]); + ntests = atoi(argv[2]); + thresh = atof(argv[3]); + debug = atoi(argv[4]); + test_prob = atof(argv[5]); + } + + seed = 1999; + + if (nsizes < 0 || ntests < 0 || debug < 0 || debug > 3) + BLAS_error("Testing gbmv2", 0, 0, NULL); + + printf("Testing %s...\n", base_routine); + printf("INPUT: nsizes = %d, ntests = %d, thresh = %4.2f, debug = %d\n\n", + nsizes, ntests, thresh, debug); + + + + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_d_s"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_d_s(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_s_d"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_s_d(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_s_s"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_s_s(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_z_c"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_z_c(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_c_z"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_c_z(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_c_c"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_c_c(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_c_s"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_c_s(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_s_c"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_s_c(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_s_s"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_s_s(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_z_d"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_z_d(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_d_z"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_d_z(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_d_d"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_d_d(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_sgbmv2_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_sgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_d_s_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_d_s_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_s_d_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_s_d_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_dgbmv2_s_s_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_dgbmv2_s_s_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_z_c_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_z_c_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_c_z_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_c_z_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_c_c_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_c_c_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_c_s_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_c_s_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_s_c_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_s_c_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_cgbmv2_s_s_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_cgbmv2_s_s_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_z_d_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_z_d_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_d_z_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_d_z_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + min_ratio = 1e308; + max_ratio = 0.0; + total_bad_ratios = 0; + total_tests = 0; + fname = "BLAS_zgbmv2_d_d_x"; + printf("Testing %s...\n", fname); + for (i = 0; i < nsizes; i++) { + m = mn_pairs[i][0]; + n = mn_pairs[i][1]; + total_max_ratio = + do_test_zgbmv2_d_d_x(m, n, 1, &seed, thresh, debug, test_prob, + &total_min_ratio, &num_bad_ratio, &num_tests); + if (total_max_ratio > max_ratio) + max_ratio = total_max_ratio; + + if (total_min_ratio != 0.0 && total_min_ratio < min_ratio) + min_ratio = total_min_ratio; + + total_bad_ratios += num_bad_ratio; + total_tests += num_tests; + } + + if (total_bad_ratios == 0) + printf("PASS> "); + else { + nr_failed_routines++; + printf("FAIL> "); + } + nr_routines++; + + if (min_ratio == 1e308) + min_ratio = 0.0; + + printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname, + total_bad_ratios, total_tests, max_ratio); + + + printf("\n"); + if (nr_failed_routines) + printf("FAILED "); + else + printf("PASSED "); + printf("%-10s: FAIL/TOTAL = %d/%d\n", + base_routine, nr_failed_routines, nr_routines); + + return 0; +} |