summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--XBLAS/Makefile29
-rw-r--r--XBLAS/src/blas_extended_proto.h142
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c303
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c1516
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c281
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c1304
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c269
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c946
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_cgbmv2_x.c1688
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c254
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c620
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c254
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c620
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c254
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c584
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_dgbmv2_x.c614
-rw-r--r--XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_sgbmv2_x.c727
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c314
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c1560
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c314
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c1956
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c270
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c896
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c280
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c1274
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_x.c1936
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c314
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c1956
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c18
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c302
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c19
-rw-r--r--XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c1532
-rw-r--r--XBLAS/src/gbmv2/Makefile80
-rw-r--r--XBLAS/testing/blas_extended_test.h111
-rw-r--r--XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c2315
-rw-r--r--XBLAS/testing/test-gbmv2/Makefile43
-rw-r--r--XBLAS/testing/test-gbmv2/do_test_gbmv2.c18419
63 files changed, 44789 insertions, 11 deletions
diff --git a/XBLAS/Makefile b/XBLAS/Makefile
index c43bc1b3..5a006bfd 100644
--- a/XBLAS/Makefile
+++ b/XBLAS/Makefile
@@ -11,10 +11,10 @@
include make.conf
include $(MAKEINC)
-LIB = dot-lib sum-lib axpby-lib waxpby-lib gemv-lib ge_sum_mv-lib gbmv-lib symv-lib spmv-lib sbmv-lib hemv-lib hpmv-lib hbmv-lib trmv-lib tpmv-lib trsv-lib tbsv-lib gemm-lib symm-lib hemm-lib gemv2-lib symv2-lib hemv2-lib
-TESTS = dot-test sum-test axpby-test waxpby-test gemv-test ge_sum_mv-test gbmv-test symv-test spmv-test sbmv-test hemv-test hpmv-test hbmv-test trmv-test tpmv-test trsv-test tbsv-test gemm-test symm-test hemm-test gemv2-test symv2-test hemv2-test
-TEST_LIB = dot-test-lib sum-test-lib axpby-test-lib waxpby-test-lib gemv-test-lib ge_sum_mv-test-lib gbmv-test-lib symv-test-lib spmv-test-lib sbmv-test-lib hemv-test-lib hpmv-test-lib hbmv-test-lib trmv-test-lib tpmv-test-lib trsv-test-lib tbsv-test-lib gemm-test-lib symm-test-lib hemm-test-lib gemv2-test-lib symv2-test-lib hemv2-test-lib
-CLEAN = dot-clean sum-clean axpby-clean waxpby-clean gemv-clean ge_sum_mv-clean gbmv-clean symv-clean spmv-clean sbmv-clean hemv-clean hpmv-clean hbmv-clean trmv-clean tpmv-clean trsv-clean tbsv-clean gemm-clean symm-clean hemm-clean gemv2-clean symv2-clean hemv2-clean dot2-clean
+LIB = dot-lib sum-lib axpby-lib waxpby-lib gemv-lib ge_sum_mv-lib gbmv-lib symv-lib spmv-lib sbmv-lib hemv-lib hpmv-lib hbmv-lib trmv-lib tpmv-lib trsv-lib tbsv-lib gemm-lib symm-lib hemm-lib gemv2-lib symv2-lib hemv2-lib gbmv2-lib
+TESTS = dot-test sum-test axpby-test waxpby-test gemv-test ge_sum_mv-test gbmv-test symv-test spmv-test sbmv-test hemv-test hpmv-test hbmv-test trmv-test tpmv-test trsv-test tbsv-test gemm-test symm-test hemm-test gemv2-test symv2-test hemv2-test gbmv2-test
+TEST_LIB = dot-test-lib sum-test-lib axpby-test-lib waxpby-test-lib gemv-test-lib ge_sum_mv-test-lib gbmv-test-lib symv-test-lib spmv-test-lib sbmv-test-lib hemv-test-lib hpmv-test-lib hbmv-test-lib trmv-test-lib tpmv-test-lib trsv-test-lib tbsv-test-lib gemm-test-lib symm-test-lib hemm-test-lib gemv2-test-lib symv2-test-lib hemv2-test-lib gbmv2-test-lib
+CLEAN = dot-clean sum-clean axpby-clean waxpby-clean gemv-clean ge_sum_mv-clean gbmv-clean symv-clean spmv-clean sbmv-clean hemv-clean hpmv-clean hbmv-clean trmv-clean tpmv-clean trsv-clean tbsv-clean gemm-clean symm-clean hemm-clean gemv2-clean symv2-clean hemv2-clean gbmv2-clean dot2-clean
SRC_DIR = src
TEST_DIR = testing
@@ -52,6 +52,7 @@ tests: test-lib $(TESTS)
cat $(TEST_DIR)/test-gemv2/gemv2.results >> testall.result
cat $(TEST_DIR)/test-symv2/symv2.results >> testall.result
cat $(TEST_DIR)/test-hemv2/hemv2.results >> testall.result
+ cat $(TEST_DIR)/test-gbmv2/gbmv2.results >> testall.result
grep 'FAIL/TOTAL' testall.result >testall.summary
cat testall.summary
@@ -86,6 +87,7 @@ lib: $(LIB)
cd $(SRC_DIR)/gemv2 && $(MAKE) lib
cd $(SRC_DIR)/symv2 && $(MAKE) lib
cd $(SRC_DIR)/hemv2 && $(MAKE) lib
+ cd $(SRC_DIR)/gbmv2 && $(MAKE) lib
# custom test-dot2 stuff
@@ -536,6 +538,25 @@ hemv2-clean:
cd $(TEST_DIR)/test-hemv2 && $(MAKE) clean
+# gbmv2 stuff
+
+gbmv2: gbmv2-test
+
+
+gbmv2-lib: common-lib
+ cd $(SRC_DIR)/gbmv2 && $(MAKE)
+
+gbmv2-test-lib: lib common-test-lib
+ cd $(TEST_DIR)/test-gbmv2 && $(MAKE) do_test_gbmv2
+
+gbmv2-test: gbmv2-test-lib
+ cd $(TEST_DIR)/test-gbmv2 && $(MAKE)
+
+gbmv2-clean:
+ cd $(SRC_DIR)/gbmv2 && $(MAKE) clean
+ cd $(TEST_DIR)/test-gbmv2 && $(MAKE) clean
+
+
# Test library dependencies
sum-test-lib: dot-test-lib
diff --git a/XBLAS/src/blas_extended_proto.h b/XBLAS/src/blas_extended_proto.h
index ba1497a0..3bdc3fde 100644
--- a/XBLAS/src/blas_extended_proto.h
+++ b/XBLAS/src/blas_extended_proto.h
@@ -1968,6 +1968,148 @@ void BLAS_zhemv2_z_d_x(enum blas_order_type order, enum blas_uplo_type uplo,
enum blas_prec_type prec);
+void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy);
+void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy);
+void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy);
+void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy);
+void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, float alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, float beta,
+ float *y, int incy, enum blas_prec_type prec);
+void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
int BLAS_fpinfo_x(enum blas_cmach_type cmach, enum blas_prec_type prec);
void BLAS_error(const char *rname, int iflag, int ival, char *form, ...);
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c
new file mode 100644
index 00000000..5f4f728a
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_cgbmv2_c_s, BLAS_CGBMV2_C_S)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_cgbmv2_c_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c
new file mode 100644
index 00000000..3787bdba
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s.c
@@ -0,0 +1,303 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_c_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_c_s";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1[2];
+ float sum2[2];
+ float prod[2];
+ float a_elem[2];
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1];
+ tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0];
+ }
+
+ {
+ tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
+ tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
+ }
+
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_cgbmv2_c_s */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c
new file mode 100644
index 00000000..9f95e1ff
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_cgbmv2_c_s_x, BLAS_CGBMV2_C_S_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_cgbmv2_c_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c
new file mode 100644
index 00000000..d4411041
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_c_s_x.c
@@ -0,0 +1,1516 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_c_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_c_s_x";
+
+ switch (prec) {
+ case blas_prec_single:{
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1[2];
+ float sum2[2];
+ float prod[2];
+ float a_elem[2];
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1];
+ tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0];
+ }
+
+ {
+ tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
+ tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
+ }
+
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+ break;
+ }
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ float result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = (double) a_elem[0] * x_elem;
+ prod[1] = (double) a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = (double) a_elem[0] * x_elem;
+ prod[1] = (double) a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = (double) a_elem[0] * x_elem;
+ prod[1] = (double) a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = (double) a_elem[0] * x_elem;
+ prod[1] = (double) a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ float result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ float a_elem[2];
+ float x_elem;
+ float y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ head_prod[0] = (double) a_elem[0] * x_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) a_elem[1] * x_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem = tail_x_i[jx];
+ {
+ head_prod[0] = (double) a_elem[0] * x_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) a_elem[1] * x_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ head_prod[0] = (double) a_elem[0] * x_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) a_elem[1] * x_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem = tail_x_i[jx];
+ {
+ head_prod[0] = (double) a_elem[0] * x_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) a_elem[1] * x_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ }
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) beta_i[0] * y_elem[0];
+ d2 = (double) -beta_i[1] * y_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[0] = head_e1;
+ tail_tmp4[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) beta_i[0] * y_elem[1];
+ d2 = (double) beta_i[1] * y_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[1] = head_e1;
+ tail_tmp4[1] = tail_e1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_cgbmv2_c_s_x */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c
new file mode 100644
index 00000000..45b6160a
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_cgbmv2_s_c, BLAS_CGBMV2_S_C)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const float *a, int *lda, const void *head_x, const void *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_cgbmv2_s_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c
new file mode 100644
index 00000000..f539a878
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c.c
@@ -0,0 +1,281 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_s_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_s_c";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1[2];
+ float sum2[2];
+ float prod[2];
+ float a_elem;
+ float x_elem[2];
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1];
+ tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0];
+ }
+
+ {
+ tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
+ tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
+ }
+
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_cgbmv2_s_c */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c
new file mode 100644
index 00000000..7d484f5a
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_cgbmv2_s_c_x, BLAS_CGBMV2_S_C_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const float *a, int *lda, const void *head_x, const void *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_cgbmv2_s_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c
new file mode 100644
index 00000000..4a570797
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_c_x.c
@@ -0,0 +1,1304 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_s_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_s_c_x";
+
+ switch (prec) {
+ case blas_prec_single:{
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1[2];
+ float sum2[2];
+ float prod[2];
+ float a_elem;
+ float x_elem[2];
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1];
+ tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0];
+ }
+
+ {
+ tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
+ tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
+ }
+
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+ break;
+ }
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ float result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem;
+ float x_elem[2];
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ prod[0] = (double) x_elem[0] * a_elem;
+ prod[1] = (double) x_elem[1] * a_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = (double) x_elem[0] * a_elem;
+ prod[1] = (double) x_elem[1] * a_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ float result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ float a_elem;
+ float x_elem[2];
+ float y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ head_prod[0] = (double) x_elem[0] * a_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) x_elem[1] * a_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ head_prod[0] = (double) x_elem[0] * a_elem;
+ tail_prod[0] = 0.0;
+ head_prod[1] = (double) x_elem[1] * a_elem;
+ tail_prod[1] = 0.0;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ }
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) beta_i[0] * y_elem[0];
+ d2 = (double) -beta_i[1] * y_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[0] = head_e1;
+ tail_tmp4[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) beta_i[0] * y_elem[1];
+ d2 = (double) beta_i[1] * y_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[1] = head_e1;
+ tail_tmp4[1] = tail_e1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_cgbmv2_s_c_x */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c
new file mode 100644
index 00000000..fddb2b00
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_cgbmv2_s_s, BLAS_CGBMV2_S_S)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const float *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_cgbmv2_s_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c
new file mode 100644
index 00000000..8e326907
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s.c
@@ -0,0 +1,269 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_s_s";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1;
+ float sum2;
+ float prod;
+ float a_elem;
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = alpha_i[0] * sum1;
+ tmp1[1] = alpha_i[1] * sum1;
+ }
+ {
+ tmp2[0] = alpha_i[0] * sum2;
+ tmp2[1] = alpha_i[1] * sum2;
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_cgbmv2_s_s */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c
new file mode 100644
index 00000000..f05811d8
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_cgbmv2_s_s_x, BLAS_CGBMV2_S_S_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const float *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_cgbmv2_s_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c
new file mode 100644
index 00000000..0c7c8770
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_s_s_x.c
@@ -0,0 +1,946 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_s_s_x";
+
+ switch (prec) {
+ case blas_prec_single:{
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1;
+ float sum2;
+ float prod;
+ float a_elem;
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = alpha_i[0] * sum1;
+ tmp1[1] = alpha_i[1] * sum1;
+ }
+ {
+ tmp2[0] = alpha_i[0] * sum2;
+ tmp2[1] = alpha_i[1] * sum2;
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+ break;
+ }
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ float result[2];
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ float x_elem;
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = (double) x_elem *a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = (double) x_elem *a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = alpha_i[0] * sum1;
+ tmp1[1] = alpha_i[1] * sum1;
+ }
+ {
+ tmp2[0] = alpha_i[0] * sum2;
+ tmp2[1] = alpha_i[1] * sum2;
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ float result[2];
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ float a_elem;
+ float x_elem;
+ float y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ double head_e1, tail_e1;
+ double dt;
+ dt = (double) alpha_i[0];
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum1 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp1[0] = head_e1;
+ tail_tmp1[0] = tail_e1;
+ dt = (double) alpha_i[1];
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum1 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp1[1] = head_e1;
+ tail_tmp1[1] = tail_e1;
+ }
+ {
+ double head_e1, tail_e1;
+ double dt;
+ dt = (double) alpha_i[0];
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum2 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp2[0] = head_e1;
+ tail_tmp2[0] = tail_e1;
+ dt = (double) alpha_i[1];
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum2 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp2[1] = head_e1;
+ tail_tmp2[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) beta_i[0] * y_elem[0];
+ d2 = (double) -beta_i[1] * y_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[0] = head_e1;
+ tail_tmp4[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) beta_i[0] * y_elem[1];
+ d2 = (double) beta_i[1] * y_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[1] = head_e1;
+ tail_tmp4[1] = tail_e1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_cgbmv2_s_s_x */
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c
new file mode 100644
index 00000000..4f9fa6a1
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_cgbmv2_x, BLAS_CGBMV2_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy, int *prec) {
+ BLAS_cgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy,
+ (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c
new file mode 100644
index 00000000..2985002f
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_cgbmv2_x.c
@@ -0,0 +1,1688 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_cgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_cgbmv2_x";
+
+ switch (prec) {
+ case blas_prec_single:{
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ float tmp1[2];
+ float tmp2[2];
+ float tmp3[2];
+ float tmp4[2];
+ float result[2];
+ float sum1[2];
+ float sum2[2];
+ float prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1];
+ prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0];
+ }
+
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1];
+ prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0];
+ }
+
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1];
+ prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0];
+ }
+
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem[0] - x_elem[1] * a_elem[1];
+ prod[1] = x_elem[0] * a_elem[1] + x_elem[1] * a_elem[0];
+ }
+
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = sum1[0] * alpha_i[0] - sum1[1] * alpha_i[1];
+ tmp1[1] = sum1[0] * alpha_i[1] + sum1[1] * alpha_i[0];
+ }
+
+ {
+ tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
+ tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
+ }
+
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] = beta_i[0] * y_elem[0] - beta_i[1] * y_elem[1];
+ tmp4[1] = beta_i[0] * y_elem[1] + beta_i[1] * y_elem[0];
+ }
+
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+ break;
+ }
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ float result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ float y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = (float *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ float *alpha_i = (float *) alpha;
+ float *beta_i = (float *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ float result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ float y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ }
+ {
+ double cd[2];
+ cd[0] = (double) alpha_i[0];
+ cd[1] = (double) alpha_i[1];
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a0 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a1 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ c11 = head_a1 * cd[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * cd[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ c11 = head_a0 * cd[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * cd[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) beta_i[0] * y_elem[0];
+ d2 = (double) -beta_i[1] * y_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[0] = head_e1;
+ tail_tmp4[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) beta_i[0] * y_elem[1];
+ d2 = (double) beta_i[1] * y_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_tmp4[1] = head_e1;
+ tail_tmp4[1] = tail_e1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_cgbmv2_x */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c
new file mode 100644
index 00000000..52ed8941
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy);
+
+
+extern void FC_FUNC_(blas_dgbmv2_d_s, BLAS_DGBMV2_D_S)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const double *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, double *beta, double *y, int *incy) {
+ BLAS_dgbmv2_d_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c
new file mode 100644
index 00000000..0c70c784
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s.c
@@ -0,0 +1,254 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_d_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_d_s";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const double *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ double a_elem;
+ float x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_dgbmv2_d_s */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c
new file mode 100644
index 00000000..bca44d58
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_dgbmv2_d_s_x, BLAS_DGBMV2_D_S_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const double *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, double *beta, double *y, int *incy, int *prec) {
+ BLAS_dgbmv2_d_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c
new file mode 100644
index 00000000..bd27105e
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_d_s_x.c
@@ -0,0 +1,620 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_d_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_d_s_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const double *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ double a_elem;
+ float x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const double *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ double result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ double a_elem;
+ float x_elem;
+ double y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ {
+ double dt = (double) x_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = dt * split;
+ a1 = con - dt;
+ a1 = con - a1;
+ a2 = dt - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = dt * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ {
+ double dt = (double) x_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = dt * split;
+ a1 = con - dt;
+ a1 = con - a1;
+ a2 = dt - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = dt * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum1 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum2 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i * split;
+ a1 = con - beta_i;
+ a1 = con - a1;
+ a2 = beta_i - a1;
+ con = y_elem * split;
+ b1 = con - y_elem;
+ b1 = con - b1;
+ b2 = y_elem - b1;
+
+ head_tmp4 = beta_i * y_elem;
+ tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_dgbmv2_d_s_x */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c
new file mode 100644
index 00000000..3fc22560
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy);
+
+
+extern void FC_FUNC_(blas_dgbmv2_s_d, BLAS_DGBMV2_S_D)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const float *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, double *beta, double *y, int *incy) {
+ BLAS_dgbmv2_s_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c
new file mode 100644
index 00000000..8b840787
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d.c
@@ -0,0 +1,254 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_s_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_s_d";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ double x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_dgbmv2_s_d */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c
new file mode 100644
index 00000000..e02f9fd5
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_dgbmv2_s_d_x, BLAS_DGBMV2_S_D_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const float *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, double *beta, double *y, int *incy, int *prec) {
+ BLAS_dgbmv2_s_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c
new file mode 100644
index 00000000..eda4da18
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_d_x.c
@@ -0,0 +1,620 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_s_d_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ double x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ double result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ float a_elem;
+ double x_elem;
+ double y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ {
+ double dt = (double) a_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ head_prod = x_elem * dt;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ {
+ double dt = (double) a_elem;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ head_prod = x_elem * dt;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum1 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum2 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i * split;
+ a1 = con - beta_i;
+ a1 = con - a1;
+ a2 = beta_i - a1;
+ con = y_elem * split;
+ b1 = con - y_elem;
+ b1 = con - b1;
+ b2 = y_elem - b1;
+
+ head_tmp4 = beta_i * y_elem;
+ tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_dgbmv2_s_d_x */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c
new file mode 100644
index 00000000..f8d78cd8
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy);
+
+
+extern void FC_FUNC_(blas_dgbmv2_s_s, BLAS_DGBMV2_S_S)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const float *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, double *beta, double *y, int *incy) {
+ BLAS_dgbmv2_s_s(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c
new file mode 100644
index 00000000..fa29f394
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s.c
@@ -0,0 +1,254 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_s_s(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_s_s";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ float x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = (double) x_elem *a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = (double) x_elem *a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_dgbmv2_s_s */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c
new file mode 100644
index 00000000..cac401bf
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_dgbmv2_s_s_x, BLAS_DGBMV2_S_S_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const float *a, int *lda, const float *head_x, const float *tail_x,
+ int *incx, double *beta, double *y, int *incy, int *prec) {
+ BLAS_dgbmv2_s_s_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c
new file mode 100644
index 00000000..8eb8369e
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_s_s_x.c
@@ -0,0 +1,584 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_s_s_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_s_s_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ float x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = (double) x_elem *a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = (double) x_elem *a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ double result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ float a_elem;
+ float x_elem;
+ double y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum1 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum2 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i * split;
+ a1 = con - beta_i;
+ a1 = con - a1;
+ a2 = beta_i - a1;
+ con = y_elem * split;
+ b1 = con - y_elem;
+ b1 = con - b1;
+ b2 = y_elem - b1;
+
+ head_tmp4 = beta_i * y_elem;
+ tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_dgbmv2_s_s_x */
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c
new file mode 100644
index 00000000..983c9585
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_dgbmv2_x, BLAS_DGBMV2_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, double *alpha,
+ const double *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, double *beta, double *y, int *incy, int *prec) {
+ BLAS_dgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, *incy,
+ (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c
new file mode 100644
index 00000000..e66c3edf
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_dgbmv2_x.c
@@ -0,0 +1,614 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_dgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, double alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, double beta,
+ double *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) double
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) double
+ *
+ * y (input) const double*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_dgbmv2_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const double *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ double result;
+ double sum1;
+ double sum2;
+ double prod;
+ double a_elem;
+ double x_elem;
+ double y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = y;
+ const double *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double alpha_i = alpha;
+ double beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ double result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ double a_elem;
+ double x_elem;
+ double y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = x_elem * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = x_elem * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum1 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i * split;
+ b1 = con - alpha_i;
+ b1 = con - b1;
+ b2 = alpha_i - b1;
+
+ c11 = head_sum2 * alpha_i;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i * split;
+ a1 = con - beta_i;
+ a1 = con - a1;
+ a2 = beta_i - a1;
+ con = y_elem * split;
+ b1 = con - y_elem;
+ b1 = con - b1;
+ b2 = y_elem - b1;
+
+ head_tmp4 = beta_i * y_elem;
+ tail_tmp4 = (((a1 * b1 - head_tmp4) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_dgbmv2_x */
diff --git a/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c
new file mode 100644
index 00000000..6454c4f5
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_sgbmv2_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, float alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, float beta,
+ float *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_sgbmv2_x, BLAS_SGBMV2_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, float *alpha, const float *a,
+ int *lda, const float *head_x, const float *tail_x, int *incx, float *beta,
+ float *y, int *incy, int *prec) {
+ BLAS_sgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, *alpha, a, *lda, head_x, tail_x, *incx, *beta, y, *incy,
+ (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c
new file mode 100644
index 00000000..c5397379
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_sgbmv2_x.c
@@ -0,0 +1,727 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_sgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, float alpha,
+ const float *a, int lda, const float *head_x,
+ const float *tail_x, int incx, float beta,
+ float *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) float
+ *
+ * AB (input) float*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) float*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) float
+ *
+ * y (input) const float*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_sgbmv2_x";
+
+ switch (prec) {
+ case blas_prec_single:{
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float alpha_i = alpha;
+ float beta_i = beta;
+ float tmp1;
+ float tmp2;
+ float tmp3;
+ float tmp4;
+ float result;
+ float sum1;
+ float sum2;
+ float prod;
+ float a_elem;
+ float x_elem;
+ float y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = beta_i * y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+ break;
+ }
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float alpha_i = alpha;
+ float beta_i = beta;
+ double tmp1;
+ double tmp2;
+ double tmp3;
+ double tmp4;
+ float result;
+ double sum1;
+ double sum2;
+ double prod;
+ float a_elem;
+ float x_elem;
+ float y_elem;
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = (double) x_elem *a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = (double) x_elem *a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ tmp1 = sum1 * alpha_i;
+ tmp2 = sum2 * alpha_i;
+ tmp3 = tmp1 + tmp2;
+ y_elem = y_i[iy];
+ tmp4 = (double) beta_i *y_elem;
+ result = tmp4 + tmp3;
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ float *y_i = y;
+ const float *a_i = a;
+ const float *head_x_i = head_x;
+ const float *tail_x_i = tail_x;
+ float alpha_i = alpha;
+ float beta_i = beta;
+ double head_tmp1, tail_tmp1;
+ double head_tmp2, tail_tmp2;
+ double head_tmp3, tail_tmp3;
+ double head_tmp4, tail_tmp4;
+ float result;
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ float a_elem;
+ float x_elem;
+ float y_elem;
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i == 0.0) && (beta_i == 1.0))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+
+
+
+
+
+
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ head_prod = (double) x_elem *a_elem;
+ tail_prod = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ double dt = (double) alpha_i;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum1 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp1 = t1 + t2;
+ tail_tmp1 = t2 - (head_tmp1 - t1);
+ }
+ }
+ {
+ double dt = (double) alpha_i;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = dt * split;
+ b1 = con - dt;
+ b1 = con - b1;
+ b2 = dt - b1;
+
+ c11 = head_sum2 * dt;
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * dt;
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_tmp2 = t1 + t2;
+ tail_tmp2 = t2 - (head_tmp2 - t1);
+ }
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp1 + head_tmp2;
+ bv = s1 - head_tmp1;
+ s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp1 + tail_tmp2;
+ bv = t1 - tail_tmp1;
+ t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_tmp3 = t1 + t2;
+ tail_tmp3 = t2 - (head_tmp3 - t1);
+ }
+ y_elem = y_i[iy];
+ head_tmp4 = (double) beta_i *y_elem;
+ tail_tmp4 = 0.0;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_tmp4 + head_tmp3;
+ bv = s1 - head_tmp4;
+ s2 = ((head_tmp3 - bv) + (head_tmp4 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_tmp4 + tail_tmp3;
+ bv = t1 - tail_tmp4;
+ t2 = ((tail_tmp3 - bv) + (tail_tmp4 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result = t1 + t2;
+ }
+ y_i[iy] = result;
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_sgbmv2_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c
new file mode 100644
index 00000000..2165e22b
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_c_c, BLAS_ZGBMV2_C_C)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_c_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c
new file mode 100644
index 00000000..6da002ea
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c.c
@@ -0,0 +1,314 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_c_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_c_c";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_c_c */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c
new file mode 100644
index 00000000..368eec3b
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_c_c_x, BLAS_ZGBMV2_C_C_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_c_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c
new file mode 100644
index 00000000..108a3062
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_c_x.c
@@ -0,0 +1,1560 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_c_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_c_c_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ float a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double head_e1, tail_e1;
+ double d1;
+ double d2;
+ /* Real part */
+ d1 = (double) x_elem[0] * a_elem[0];
+ d2 = (double) -x_elem[1] * a_elem[1];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[0] = head_e1;
+ tail_prod[0] = tail_e1;
+ /* imaginary part */
+ d1 = (double) x_elem[0] * a_elem[1];
+ d2 = (double) x_elem[1] * a_elem[0];
+ {
+ /* Compute double-double = double + double. */
+ double e, t1, t2;
+
+ /* Knuth trick. */
+ t1 = d1 + d2;
+ e = t1 - d1;
+ t2 = ((d2 - e) + (d1 - (t1 - e)));
+
+ /* The result is t1 + t2, after normalization. */
+ head_e1 = t1 + t2;
+ tail_e1 = t2 - (head_e1 - t1);
+ }
+ head_prod[1] = head_e1;
+ tail_prod[1] = tail_e1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_c_c_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c
new file mode 100644
index 00000000..df7b7cc8
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_c_z, BLAS_ZGBMV2_C_Z)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_c_z(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c
new file mode 100644
index 00000000..bb4ad742
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z.c
@@ -0,0 +1,314 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_c_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_c_z";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ double x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_c_z */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c
new file mode 100644
index 00000000..b29d074d
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_c_z_x, BLAS_ZGBMV2_C_Z_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_c_z_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c
new file mode 100644
index 00000000..4772c683
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_c_z_x.c
@@ -0,0 +1,1956 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_c_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_c_z_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ float a_elem[2];
+ double x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const float *a_i = (float *) a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ float a_elem[2];
+ double x_elem[2];
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ double cd[2];
+ cd[0] = (double) a_elem[0];
+ cd[1] = (double) a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double cd[2];
+ cd[0] = (double) a_elem[0];
+ cd[1] = (double) a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ double cd[2];
+ cd[0] = (double) a_elem[0];
+ cd[1] = (double) a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double cd[2];
+ cd[0] = (double) a_elem[0];
+ cd[1] = (double) a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = x_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = x_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_c_z_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c
new file mode 100644
index 00000000..cb517912
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_d_d, BLAS_ZGBMV2_D_D)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const double *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_d_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c
new file mode 100644
index 00000000..6c59b71d
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d.c
@@ -0,0 +1,270 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_d_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_d_d";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1;
+ double sum2;
+ double prod;
+ double a_elem;
+ double x_elem;
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = alpha_i[0] * sum1;
+ tmp1[1] = alpha_i[1] * sum1;
+ }
+ {
+ tmp2[0] = alpha_i[0] * sum2;
+ tmp2[1] = alpha_i[1] * sum2;
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_d_d */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c
new file mode 100644
index 00000000..44031fb6
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_d_d_x, BLAS_ZGBMV2_D_D_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const double *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_d_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c
new file mode 100644
index 00000000..99329c0e
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_d_x.c
@@ -0,0 +1,896 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_d_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_d_d_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1;
+ double sum2;
+ double prod;
+ double a_elem;
+ double x_elem;
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1 = 0.0;
+ sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ prod = x_elem * a_elem;
+ sum1 = sum1 + prod;
+ x_elem = tail_x_i[jx];
+ prod = x_elem * a_elem;
+ sum2 = sum2 + prod;
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = alpha_i[0] * sum1;
+ tmp1[1] = alpha_i[1] * sum1;
+ }
+ {
+ tmp2[0] = alpha_i[0] * sum2;
+ tmp2[1] = alpha_i[1] * sum2;
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1, tail_sum1;
+ double head_sum2, tail_sum2;
+ double head_prod, tail_prod;
+ double a_elem;
+ double x_elem;
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1 = tail_sum1 = 0.0;
+ head_sum2 = tail_sum2 = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem = a_i[aij];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = x_elem * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum1 + head_prod;
+ bv = s1 - head_sum1;
+ s2 = ((head_prod - bv) + (head_sum1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum1 + tail_prod;
+ bv = t1 - tail_sum1;
+ t2 = ((tail_prod - bv) + (tail_sum1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum1 = t1 + t2;
+ tail_sum1 = t2 - (head_sum1 - t1);
+ }
+ x_elem = tail_x_i[jx];
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem * split;
+ b1 = con - a_elem;
+ b1 = con - b1;
+ b2 = a_elem - b1;
+
+ head_prod = x_elem * a_elem;
+ tail_prod =
+ (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_sum2 + head_prod;
+ bv = s1 - head_sum2;
+ s2 = ((head_prod - bv) + (head_sum2 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_sum2 + tail_prod;
+ bv = t1 - tail_sum2;
+ t2 = ((tail_prod - bv) + (tail_sum2 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_sum2 = t1 + t2;
+ tail_sum2 = t2 - (head_sum2 - t1);
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_sum1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp1[0] = head_t;
+ tail_tmp1[0] = tail_t;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum1 * split;
+ a11 = con - head_sum1;
+ a11 = con - a11;
+ a21 = head_sum1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_sum1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp1[1] = head_t;
+ tail_tmp1[1] = tail_t;
+ }
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_sum2 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp2[0] = head_t;
+ tail_tmp2[0] = tail_t;
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_sum2 * split;
+ a11 = con - head_sum2;
+ a11 = con - a11;
+ a21 = head_sum2 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_sum2 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_sum2 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp2[1] = head_t;
+ tail_tmp2[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_d_d_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c
new file mode 100644
index 00000000..b034cc27
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_d_z, BLAS_ZGBMV2_D_Z)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const double *a, int *lda, const void *head_x, const void *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_d_z(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c
new file mode 100644
index 00000000..6066a47f
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z.c
@@ -0,0 +1,280 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_d_z(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_d_z";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem;
+ double x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_d_z */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c
new file mode 100644
index 00000000..f9bb4d1c
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_d_z_x, BLAS_ZGBMV2_D_Z_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const double *a, int *lda, const void *head_x, const void *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_d_z_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c
new file mode 100644
index 00000000..996fa4dc
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_d_z_x.c
@@ -0,0 +1,1274 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_d_z_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const double *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) double*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_d_z_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem;
+ double x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] = x_elem[0] * a_elem;
+ prod[1] = x_elem[1] * a_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ double a_elem;
+ double x_elem[2];
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+
+
+
+
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem = a_i[aij];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem * split;
+ a1 = con - a_elem;
+ a1 = con - a1;
+ a2 = a_elem - a1;
+ con = x_elem[0] * split;
+ b1 = con - x_elem[0];
+ b1 = con - b1;
+ b2 = x_elem[0] - b1;
+
+ head_t = a_elem * x_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem * split;
+ a1 = con - a_elem;
+ a1 = con - a1;
+ a2 = a_elem - a1;
+ con = x_elem[1] * split;
+ b1 = con - x_elem[1];
+ b1 = con - b1;
+ b2 = x_elem[1] - b1;
+
+ head_t = a_elem * x_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem * split;
+ a1 = con - a_elem;
+ a1 = con - a1;
+ a2 = a_elem - a1;
+ con = x_elem[0] * split;
+ b1 = con - x_elem[0];
+ b1 = con - b1;
+ b2 = x_elem[0] - b1;
+
+ head_t = a_elem * x_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem * split;
+ a1 = con - a_elem;
+ a1 = con - a1;
+ a2 = a_elem - a1;
+ con = x_elem[1] * split;
+ b1 = con - x_elem[1];
+ b1 = con - b1;
+ b2 = x_elem[1] - b1;
+
+ head_t = a_elem * x_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_d_z_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c
new file mode 100644
index 00000000..17af6ea7
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_x, BLAS_ZGBMV2_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy,
+ (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c
new file mode 100644
index 00000000..98cf4658
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_x.c
@@ -0,0 +1,1936 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem[2];
+ double x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const double *head_x_i = (double *) head_x;
+ const double *tail_x_i = (double *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ double a_elem[2];
+ double x_elem[2];
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[0] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[1] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[1] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[0] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[0] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[1] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[1] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[0] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[0] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[1] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[1] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[0] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[0] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[1] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[1] * split;
+ a1 = con - x_elem[1];
+ a1 = con - a1;
+ a2 = x_elem[1] - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t1 = x_elem[1] * a_elem[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem[0] * split;
+ a1 = con - x_elem[0];
+ a1 = con - a1;
+ a2 = x_elem[0] - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t2 = x_elem[0] * a_elem[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c
new file mode 100644
index 00000000..e51d9551
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_z_c, BLAS_ZGBMV2_Z_C)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_z_c(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c
new file mode 100644
index 00000000..6581d257
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c.c
@@ -0,0 +1,314 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_z_c(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_z_c";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] - (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] + (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_z_c */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c
new file mode 100644
index 00000000..45dbc677
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_z_c_x, BLAS_ZGBMV2_Z_C_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const void *head_x, const void *tail_x, int *incx,
+ const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_z_c_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c
new file mode 100644
index 00000000..5b0f1841
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_c_x.c
@@ -0,0 +1,1956 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_z_c_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const void *head_x,
+ const void *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) void*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_z_c_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ prod[0] =
+ (double) x_elem[0] * a_elem[0] -
+ (double) x_elem[1] * a_elem[1];
+ prod[1] =
+ (double) x_elem[0] * a_elem[1] +
+ (double) x_elem[1] * a_elem[0];
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const float *head_x_i = (float *) head_x;
+ const float *tail_x_i = (float *) tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ double a_elem[2];
+ float x_elem[2];
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+ incx *= 2;
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+ ix0 *= 2;
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ double cd[2];
+ cd[0] = (double) x_elem[0];
+ cd[1] = (double) x_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double cd[2];
+ cd[0] = (double) x_elem[0];
+ cd[1] = (double) x_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem[0] = head_x_i[jx];
+ x_elem[1] = head_x_i[jx + 1];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ double cd[2];
+ cd[0] = (double) x_elem[0];
+ cd[1] = (double) x_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem[0] = tail_x_i[jx];
+ x_elem[1] = tail_x_i[jx + 1];
+ {
+ double cd[2];
+ cd[0] = (double) x_elem[0];
+ cd[1] = (double) x_elem[1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[0] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[1] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[0] = head_t1;
+ tail_prod[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[1] * split;
+ a1 = con - a_elem[1];
+ a1 = con - a1;
+ a2 = a_elem[1] - a1;
+ con = cd[0] * split;
+ b1 = con - cd[0];
+ b1 = con - b1;
+ b2 = cd[0] - b1;
+
+ head_t1 = a_elem[1] * cd[0];
+ tail_t1 =
+ (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = a_elem[0] * split;
+ a1 = con - a_elem[0];
+ a1 = con - a1;
+ a2 = a_elem[0] - a1;
+ con = cd[1] * split;
+ b1 = con - cd[1];
+ b1 = con - b1;
+ b2 = cd[1] - b1;
+
+ head_t2 = a_elem[0] * cd[1];
+ tail_t2 =
+ (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_prod[1] = head_t1;
+ tail_prod[1] = tail_t1;
+ }
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_z_c_x */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c
new file mode 100644
index 00000000..9d8451e7
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d-f2c.c
@@ -0,0 +1,18 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy);
+
+
+extern void FC_FUNC_(blas_zgbmv2_z_d, BLAS_ZGBMV2_Z_D)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, const void *beta, void *y, int *incy) {
+ BLAS_zgbmv2_z_d(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y, *incy);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c
new file mode 100644
index 00000000..3a9e537f
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d.c
@@ -0,0 +1,302 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_z_d(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_z_d";
+
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem[2];
+ double x_elem;
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] = (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] = (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] = (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] = (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+
+} /* end BLAS_zgbmv2_z_d */
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c
new file mode 100644
index 00000000..fad23ee6
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x-f2c.c
@@ -0,0 +1,19 @@
+
+#include "f2c-bridge.h"
+#include "blas_enum.h"
+void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec);
+
+
+extern void FC_FUNC_(blas_zgbmv2_z_d_x, BLAS_ZGBMV2_Z_D_X)
+
+ (int *trans, int *m, int *n, int *kl, int *ku, const void *alpha,
+ const void *a, int *lda, const double *head_x, const double *tail_x,
+ int *incx, const void *beta, void *y, int *incy, int *prec) {
+ BLAS_zgbmv2_z_d_x(blas_colmajor, (enum blas_trans_type) *trans, *m, *n, *kl,
+ *ku, alpha, a, *lda, head_x, tail_x, *incx, beta, y,
+ *incy, (enum blas_prec_type) *prec);
+}
diff --git a/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c
new file mode 100644
index 00000000..df3e530f
--- /dev/null
+++ b/XBLAS/src/gbmv2/BLAS_zgbmv2_z_d_x.c
@@ -0,0 +1,1532 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+void BLAS_zgbmv2_z_d_x(enum blas_order_type order, enum blas_trans_type trans,
+ int m, int n, int kl, int ku, const void *alpha,
+ const void *a, int lda, const double *head_x,
+ const double *tail_x, int incx, const void *beta,
+ void *y, int incy, enum blas_prec_type prec)
+
+/*
+ * Purpose
+ * =======
+ *
+ * This routines computes the matrix product:
+ *
+ * y <- alpha * op(A) * (x_head + x_tail) + beta * y
+ *
+ * where
+ *
+ * A is a m x n banded matrix
+ * x is a n x 1 vector
+ * y is a m x 1 vector
+ * alpha and beta are scalars
+ *
+ * Arguments
+ * =========
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Transpose of AB; no trans,
+ * trans, or conjugate trans
+ *
+ * m (input) int
+ * Dimension of AB
+ *
+ * n (input) int
+ * Dimension of AB and the length of vector x and z
+ *
+ * kl (input) int
+ * Number of lower diagnols of AB
+ *
+ * ku (input) int
+ * Number of upper diagnols of AB
+ *
+ * alpha (input) const void*
+ *
+ * AB (input) void*
+ *
+ * lda (input) int
+ * Leading dimension of AB
+ * lda >= ku + kl + 1
+ *
+ * head_x
+ * tail_x (input) double*
+ *
+ * incx (input) int
+ * The stride for vector x.
+ *
+ * beta (input) const void*
+ *
+ * y (input) const void*
+ *
+ * incy (input) int
+ * The stride for vector y.
+ *
+ * prec (input) enum blas_prec_type
+ * Specifies the internal precision to be used.
+ * = blas_prec_single: single precision.
+ * = blas_prec_double: double precision.
+ * = blas_prec_extra : anything at least 1.5 times as accurate
+ * than double, and wider than 80-bits.
+ * We use double-double in our implementation.
+ *
+ *
+ * LOCAL VARIABLES
+ * ===============
+ *
+ * As an example, these variables are described on the mxn, column
+ * major, banded matrix described in section 2.2.3 of the specification
+ *
+ * astart indexes first element in A where computation begins
+ *
+ * incai1 indexes first element in row where row is less than lbound
+ *
+ * incai2 indexes first element in row where row exceeds lbound
+ *
+ * lbound denotes the number of rows before first element shifts
+ *
+ * rbound denotes the columns where there is blank space
+ *
+ * ra index of the rightmost element for a given row
+ *
+ * la index of leftmost elements for a given row
+ *
+ * ra - la width of a row
+ *
+ * rbound
+ * la ra ____|_____
+ * | | | |
+ * | a00 a01 * * *
+ * lbound -| a10 a11 a12 * *
+ * | a20 a21 a22 a23 *
+ * * a31 a32 a33 a34
+ * * * a42 a43 a44
+ *
+ * Varations on order and transpose have been implemented by modifying these
+ * local variables.
+ *
+ */
+{
+ static const char routine_name[] = "BLAS_zgbmv2_z_d_x";
+
+ switch (prec) {
+ case blas_prec_single:
+ case blas_prec_double:
+ case blas_prec_indigenous:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double tmp1[2];
+ double tmp2[2];
+ double tmp3[2];
+ double tmp4[2];
+ double result[2];
+ double sum1[2];
+ double sum2[2];
+ double prod[2];
+ double a_elem[2];
+ double x_elem;
+ double y_elem[2];
+
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ sum1[0] = sum1[1] = 0.0;
+ sum2[0] = sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum1[0] = sum1[0] + prod[0];
+ sum1[1] = sum1[1] + prod[1];
+ x_elem = tail_x_i[jx];
+ {
+ prod[0] = a_elem[0] * x_elem;
+ prod[1] = a_elem[1] * x_elem;
+ }
+ sum2[0] = sum2[0] + prod[0];
+ sum2[1] = sum2[1] + prod[1];
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ tmp1[0] =
+ (double) sum1[0] * alpha_i[0] - (double) sum1[1] * alpha_i[1];
+ tmp1[1] =
+ (double) sum1[0] * alpha_i[1] + (double) sum1[1] * alpha_i[0];
+ }
+ {
+ tmp2[0] =
+ (double) sum2[0] * alpha_i[0] - (double) sum2[1] * alpha_i[1];
+ tmp2[1] =
+ (double) sum2[0] * alpha_i[1] + (double) sum2[1] * alpha_i[0];
+ }
+ tmp3[0] = tmp1[0] + tmp2[0];
+ tmp3[1] = tmp1[1] + tmp2[1];
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ tmp4[0] =
+ (double) beta_i[0] * y_elem[0] - (double) beta_i[1] * y_elem[1];
+ tmp4[1] =
+ (double) beta_i[0] * y_elem[1] + (double) beta_i[1] * y_elem[0];
+ }
+ result[0] = tmp4[0] + tmp3[0];
+ result[1] = tmp4[1] + tmp3[1];
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+
+ }
+ break;
+ case blas_prec_extra:
+ {
+ int iy0, iy, ix0, jx, j, i, rbound, lbound, ra, la, lenx, leny;
+ int incaij, aij, incai1, incai2, astart, ai;
+ double *y_i = (double *) y;
+ const double *a_i = (double *) a;
+ const double *head_x_i = head_x;
+ const double *tail_x_i = tail_x;
+ double *alpha_i = (double *) alpha;
+ double *beta_i = (double *) beta;
+ double head_tmp1[2], tail_tmp1[2];
+ double head_tmp2[2], tail_tmp2[2];
+ double head_tmp3[2], tail_tmp3[2];
+ double head_tmp4[2], tail_tmp4[2];
+ double result[2];
+ double head_sum1[2], tail_sum1[2];
+ double head_sum2[2], tail_sum2[2];
+ double head_prod[2], tail_prod[2];
+ double a_elem[2];
+ double x_elem;
+ double y_elem[2];
+ FPU_FIX_DECL;
+
+ if (order != blas_colmajor && order != blas_rowmajor)
+ BLAS_error(routine_name, -1, order, NULL);
+ if (trans != blas_no_trans &&
+ trans != blas_trans && trans != blas_conj_trans) {
+ BLAS_error(routine_name, -2, trans, NULL);
+ }
+ if (m < 0)
+ BLAS_error(routine_name, -3, m, NULL);
+ if (n < 0)
+ BLAS_error(routine_name, -4, n, NULL);
+ if (kl < 0 || kl >= m)
+ BLAS_error(routine_name, -5, kl, NULL);
+ if (ku < 0 || ku >= n)
+ BLAS_error(routine_name, -6, ku, NULL);
+ if (lda < kl + ku + 1)
+ BLAS_error(routine_name, -9, lda, NULL);
+ if (incx == 0)
+ BLAS_error(routine_name, -12, incx, NULL);
+ if (incy == 0)
+ BLAS_error(routine_name, -15, incy, NULL);
+
+ if (m == 0 || n == 0)
+ return;
+ if ((alpha_i[0] == 0.0 && alpha_i[1] == 0.0)
+ && ((beta_i[0] == 1.0 && beta_i[1] == 0.0)))
+ return;
+
+ if (trans == blas_no_trans) {
+ lenx = n;
+ leny = m;
+ } else {
+ lenx = m;
+ leny = n;
+ }
+
+ ix0 = (incx > 0) ? 0 : -(lenx - 1) * incx;
+ iy0 = (incy > 0) ? 0 : -(leny - 1) * incy;
+
+ FPU_FIX_START;
+
+ /* if alpha = 0, return y = y*beta */
+ if ((order == blas_colmajor) && (trans == blas_no_trans)) {
+ astart = ku;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else if ((order == blas_colmajor) && (trans != blas_no_trans)) {
+ astart = ku;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ } else if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
+ astart = kl;
+ incai1 = lda - 1;
+ incai2 = lda;
+ incaij = 1;
+ lbound = kl;
+ rbound = n - ku - 1;
+ ra = ku;
+ } else { /* rowmajor and blas_trans */
+ astart = kl;
+ incai1 = 1;
+ incai2 = lda;
+ incaij = lda - 1;
+ lbound = ku;
+ rbound = m - kl - 1;
+ ra = kl;
+ }
+
+ incy *= 2;
+ incaij *= 2;
+ incai1 *= 2;
+ incai2 *= 2;
+ astart *= 2;
+ iy0 *= 2;
+
+
+ la = 0;
+ ai = astart;
+ iy = iy0;
+ for (i = 0; i < leny; i++) {
+ head_sum1[0] = head_sum1[1] = tail_sum1[0] = tail_sum1[1] = 0.0;
+ head_sum2[0] = head_sum2[1] = tail_sum2[0] = tail_sum2[1] = 0.0;
+ aij = ai;
+ jx = ix0;
+ if (trans != blas_conj_trans) {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t = x_elem * a_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t = x_elem * a_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem = tail_x_i[jx];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t = x_elem * a_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t = x_elem * a_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+
+ } else {
+ for (j = ra - la; j >= 0; j--) {
+ x_elem = head_x_i[jx];
+ a_elem[0] = a_i[aij];
+ a_elem[1] = a_i[aij + 1];
+ a_elem[1] = -a_elem[1];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t = x_elem * a_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t = x_elem * a_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum1[0];
+ tail_a = tail_sum1[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[0] = head_t;
+ tail_sum1[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum1[1];
+ tail_a = tail_sum1[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum1[1] = head_t;
+ tail_sum1[1] = tail_t;
+ }
+ x_elem = tail_x_i[jx];
+ {
+ /* Compute complex-extra = complex-double * real. */
+ double head_t, tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[0] * split;
+ b1 = con - a_elem[0];
+ b1 = con - b1;
+ b2 = a_elem[0] - b1;
+
+ head_t = x_elem * a_elem[0];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[0] = head_t;
+ tail_prod[0] = tail_t;
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = x_elem * split;
+ a1 = con - x_elem;
+ a1 = con - a1;
+ a2 = x_elem - a1;
+ con = a_elem[1] * split;
+ b1 = con - a_elem[1];
+ b1 = con - b1;
+ b2 = a_elem[1] - b1;
+
+ head_t = x_elem * a_elem[1];
+ tail_t = (((a1 * b1 - head_t) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_prod[1] = head_t;
+ tail_prod[1] = tail_t;
+ }
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_sum2[0];
+ tail_a = tail_sum2[0];
+ head_b = head_prod[0];
+ tail_b = tail_prod[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[0] = head_t;
+ tail_sum2[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_sum2[1];
+ tail_a = tail_sum2[1];
+ head_b = head_prod[1];
+ tail_b = tail_prod[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_sum2[1] = head_t;
+ tail_sum2[1] = tail_t;
+ }
+ aij += incaij;
+ jx += incx;
+ }
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum1[0];
+ tail_a0 = tail_sum1[0];
+ head_a1 = head_sum1[1];
+ tail_a1 = tail_sum1[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[0] = head_t1;
+ tail_tmp1[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp1[1] = head_t1;
+ tail_tmp1[1] = tail_t1;
+ }
+
+ {
+ /* Compute complex-extra = complex-extra * complex-double. */
+ double head_a0, tail_a0;
+ double head_a1, tail_a1;
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ head_a0 = head_sum2[0];
+ tail_a0 = tail_sum2[0];
+ head_a1 = head_sum2[1];
+ tail_a1 = tail_sum2[1];
+ /* real part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a0 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a1 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[0] = head_t1;
+ tail_tmp2[0] = tail_t1;
+ /* imaginary part */
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a1 * split;
+ a11 = con - head_a1;
+ a11 = con - a11;
+ a21 = head_a1 - a11;
+ con = alpha_i[0] * split;
+ b1 = con - alpha_i[0];
+ b1 = con - b1;
+ b2 = alpha_i[0] - b1;
+
+ c11 = head_a1 * alpha_i[0];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a1 * alpha_i[0];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ {
+ /* Compute double-double = double-double * double. */
+ double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
+
+ con = head_a0 * split;
+ a11 = con - head_a0;
+ a11 = con - a11;
+ a21 = head_a0 - a11;
+ con = alpha_i[1] * split;
+ b1 = con - alpha_i[1];
+ b1 = con - b1;
+ b2 = alpha_i[1] - b1;
+
+ c11 = head_a0 * alpha_i[1];
+ c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
+
+ c2 = tail_a0 * alpha_i[1];
+ t1 = c11 + c2;
+ t2 = (c2 - (t1 - c11)) + c21;
+
+ head_t2 = t1 + t2;
+ tail_t2 = t2 - (head_t2 - t1);
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp2[1] = head_t1;
+ tail_tmp2[1] = tail_t1;
+ }
+
+ {
+ double head_t, tail_t;
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp1[0];
+ tail_a = tail_tmp1[0];
+ head_b = head_tmp2[0];
+ tail_b = tail_tmp2[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[0] = head_t;
+ tail_tmp3[0] = tail_t;
+ /* Imaginary part */
+ head_a = head_tmp1[1];
+ tail_a = tail_tmp1[1];
+ head_b = head_tmp2[1];
+ tail_b = tail_tmp2[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t = t1 + t2;
+ tail_t = t2 - (head_t - t1);
+ }
+ head_tmp3[1] = head_t;
+ tail_tmp3[1] = tail_t;
+ }
+ y_elem[0] = y_i[iy];
+ y_elem[1] = y_i[iy + 1];
+ {
+ /* Compute complex-extra = complex-double * complex-double. */
+ double head_t1, tail_t1;
+ double head_t2, tail_t2;
+ /* Real part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[0] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[1] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ head_t2 = -head_t2;
+ tail_t2 = -tail_t2;
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[0] = head_t1;
+ tail_tmp4[0] = tail_t1;
+ /* Imaginary part */
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[1] * split;
+ a1 = con - beta_i[1];
+ a1 = con - a1;
+ a2 = beta_i[1] - a1;
+ con = y_elem[0] * split;
+ b1 = con - y_elem[0];
+ b1 = con - b1;
+ b2 = y_elem[0] - b1;
+
+ head_t1 = beta_i[1] * y_elem[0];
+ tail_t1 = (((a1 * b1 - head_t1) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double_double = double * double. */
+ double a1, a2, b1, b2, con;
+
+ con = beta_i[0] * split;
+ a1 = con - beta_i[0];
+ a1 = con - a1;
+ a2 = beta_i[0] - a1;
+ con = y_elem[1] * split;
+ b1 = con - y_elem[1];
+ b1 = con - b1;
+ b2 = y_elem[1] - b1;
+
+ head_t2 = beta_i[0] * y_elem[1];
+ tail_t2 = (((a1 * b1 - head_t2) + a1 * b2) + a2 * b1) + a2 * b2;
+ }
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_t1 + head_t2;
+ bv = s1 - head_t1;
+ s2 = ((head_t2 - bv) + (head_t1 - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_t1 + tail_t2;
+ bv = t1 - tail_t1;
+ t2 = ((tail_t2 - bv) + (tail_t1 - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ head_t1 = t1 + t2;
+ tail_t1 = t2 - (head_t1 - t1);
+ }
+ head_tmp4[1] = head_t1;
+ tail_tmp4[1] = tail_t1;
+ }
+ {
+ double head_a, tail_a;
+ double head_b, tail_b;
+ /* Real part */
+ head_a = head_tmp4[0];
+ tail_a = tail_tmp4[0];
+ head_b = head_tmp3[0];
+ tail_b = tail_tmp3[0];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[0] = t1 + t2;
+ }
+ /* Imaginary part */
+ head_a = head_tmp4[1];
+ tail_a = tail_tmp4[1];
+ head_b = head_tmp3[1];
+ tail_b = tail_tmp3[1];
+ {
+ /* Compute double-double = double-double + double-double. */
+ double bv;
+ double s1, s2, t1, t2;
+
+ /* Add two hi words. */
+ s1 = head_a + head_b;
+ bv = s1 - head_a;
+ s2 = ((head_b - bv) + (head_a - (s1 - bv)));
+
+ /* Add two lo words. */
+ t1 = tail_a + tail_b;
+ bv = t1 - tail_a;
+ t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
+
+ s2 += t1;
+
+ /* Renormalize (s1, s2) to (t1, s2) */
+ t1 = s1 + s2;
+ s2 = s2 - (t1 - s1);
+
+ t2 += s2;
+
+ /* Renormalize (t1, t2) */
+ result[1] = t1 + t2;
+ }
+ }
+ y_i[iy] = result[0];
+ y_i[iy + 1] = result[1];
+
+ iy += incy;
+ if (i >= lbound) {
+ ix0 += incx;
+ ai += incai2;
+ la++;
+ } else {
+ ai += incai1;
+ }
+ if (i < rbound) {
+ ra++;
+ }
+ }
+
+ FPU_FIX_STOP;
+ }
+ break;
+ }
+} /* end BLAS_zgbmv2_z_d_x */
diff --git a/XBLAS/src/gbmv2/Makefile b/XBLAS/src/gbmv2/Makefile
new file mode 100644
index 00000000..3138b90f
--- /dev/null
+++ b/XBLAS/src/gbmv2/Makefile
@@ -0,0 +1,80 @@
+include ../../make.conf
+include ../../$(MAKEINC)
+
+LIB_PATH = ../../$(OUTPUT_DIR)
+SRC_PATH = .
+HEADER_PATH = ..
+
+GBMV2_SRCS =\
+ $(SRC_PATH)/BLAS_dgbmv2_d_s.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_d.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_s.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_z.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_c_s.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_s.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_d.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_z.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_d.c\
+ $(SRC_PATH)/BLAS_sgbmv2_x.c\
+ $(SRC_PATH)/BLAS_dgbmv2_x.c\
+ $(SRC_PATH)/BLAS_cgbmv2_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_x.c\
+ $(SRC_PATH)/BLAS_dgbmv2_d_s_x.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_d_x.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_s_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_c_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_z_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_c_x.c\
+ $(SRC_PATH)/BLAS_cgbmv2_c_s_x.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_c_x.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_s_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_d_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_z_x.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_d_x.c \
+ \
+ $(SRC_PATH)/BLAS_dgbmv2_d_s-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_d-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_s-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_c-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_z-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_c-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_c_s-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_c-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_s-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_d-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_z-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_d-f2c.c\
+ $(SRC_PATH)/BLAS_sgbmv2_x-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_x-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_x-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_d_s_x-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_d_x-f2c.c\
+ $(SRC_PATH)/BLAS_dgbmv2_s_s_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_c_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_z_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_c_c_x-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_c_s_x-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_c_x-f2c.c\
+ $(SRC_PATH)/BLAS_cgbmv2_s_s_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_z_d_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_z_x-f2c.c\
+ $(SRC_PATH)/BLAS_zgbmv2_d_d_x-f2c.c
+
+GBMV2_OBJS = $(GBMV2_SRCS:.c=.o)
+
+all: $(GBMV2_OBJS)
+
+lib: $(GBMV2_OBJS)
+ $(ARCH) $(ARCHFLAGS) $(LIB_PATH)/$(XBLASLIB) $(GBMV2_OBJS)
+ $(RANLIB) $(LIB_PATH)/$(XBLASLIB)
+
+.c.o:
+ $(CC) $(CFLAGS) -I$(HEADER_PATH) -c -o $@ $<
+
+clean:
+ rm -f *.o *~ *.BAK
+
diff --git a/XBLAS/testing/blas_extended_test.h b/XBLAS/testing/blas_extended_test.h
index cab9b7d5..7f75b9bf 100644
--- a/XBLAS/testing/blas_extended_test.h
+++ b/XBLAS/testing/blas_extended_test.h
@@ -1071,13 +1071,110 @@ void BLAS_chemv2_c_s_testgen(int norm, enum blas_order_type order,
void *y, int *seed, double *head_r_true,
double *tail_r_true);
-void BLAS_sge_sum_mv_testgen(int norm, enum blas_order_type order,
- int m, int n, int randomize,
- float *alpha, int alpha_flag, float *beta,
- int beta_flag, float *a, int lda, float *b,
- int ldb, float *x, int incx,
- float *alpha_use_ptr, float *a_use, float *b_use,
- int *seed, double *head_r_true,
+
+void BLAS_sgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, float *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail, float *beta,
+ int beta_flag, float *y, int *seed, double *r_true_l,
+ double *r_true_t);
+void BLAS_dgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, double *AB,
+ int lda, double *x_head, double *x_tail,
+ double *beta, int beta_flag, double *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_cgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed, double *r_true_l,
+ double *r_true_t);
+void BLAS_zgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed, double *r_true_l,
+ double *r_true_t);
+void BLAS_cgbmv2_s_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_cgbmv2_s_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, float *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_cgbmv2_c_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, float *x_head, float *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_zgbmv2_d_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, double *AB,
+ int lda, double *x_head, double *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_zgbmv2_d_z_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, double *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_zgbmv2_z_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, double *x_head, double *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_dgbmv2_s_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail,
+ double *beta, int beta_flag, double *y,
+ int *seed, double *r_true_l, double *r_true_t);
+void BLAS_dgbmv2_s_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, float *AB,
+ int lda, double *x_head, double *x_tail,
+ double *beta, int beta_flag, double *y,
+ int *seed, double *r_true_l, double *r_true_t);
+void BLAS_dgbmv2_d_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag,
+ double *AB, int lda, float *x_head,
+ float *x_tail, double *beta, int beta_flag,
+ double *y, int *seed, double *r_true_l,
+ double *r_true_t);
+void BLAS_zgbmv2_c_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_zgbmv2_c_z_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_zgbmv2_z_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t);
+void BLAS_sge_sum_mv_testgen(int norm, enum blas_order_type order, int m,
+ int n, int randomize, float *alpha,
+ int alpha_flag, float *beta, int beta_flag,
+ float *a, int lda, float *b, int ldb, float *x,
+ int incx, float *alpha_use_ptr, float *a_use,
+ float *b_use, int *seed, double *head_r_true,
double *tail_r_true);
void BLAS_dge_sum_mv_testgen(int norm, enum blas_order_type order, int m,
int n, int randomize, double *alpha,
diff --git a/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c b/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c
new file mode 100644
index 00000000..99564ff1
--- /dev/null
+++ b/XBLAS/testing/test-gbmv2/BLAS_gbmv2_testgen.c
@@ -0,0 +1,2315 @@
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+#include "blas_extended_test.h"
+
+
+void BLAS_sgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, float *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail, float *beta,
+ int beta_flag, float *y, int *seed, double *r_true_l,
+ double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) float*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) float*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) float*
+ * x_tail (input/output) float*
+ *
+ * beta (input/output) float*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) float*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = x_head;
+ float *x_tail_i = x_tail;
+ float *y_i = y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ float y_elem;
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_sdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, &y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem;
+
+
+ /* copy a_vec to AB */
+ sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_dgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, double *AB,
+ int lda, double *x_head, double *x_tail,
+ double *beta, int beta_flag, double *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) double*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) double*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) double*
+ * x_tail (input/output) double*
+ *
+ * beta (input/output) double*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) double*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = x_head;
+ double *x_tail_i = x_tail;
+ double *y_i = y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem;
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_ddot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, &y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem;
+
+
+ /* copy a_vec to AB */
+ dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_cgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed, double *r_true_l,
+ double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = (float *) x_head;
+ float *x_tail_i = (float *) x_tail;
+ float *y_i = (float *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ float y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_cdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed, double *r_true_l,
+ double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = (double *) x_head;
+ double *x_tail_i = (double *) x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_cgbmv2_s_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) float*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) float*
+ * x_tail (input/output) float*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = x_head;
+ float *x_tail_i = x_tail;
+ float *y_i = (float *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ float y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_cdot2_s_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+
+ /* copy a_vec to AB */
+ sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_cgbmv2_s_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, float *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) float*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = (float *) x_head;
+ float *x_tail_i = (float *) x_tail;
+ float *y_i = (float *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ float y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_cdot2_c_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+
+ /* copy a_vec to AB */
+ sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_cgbmv2_c_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, float *x_head, float *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) float*
+ * x_tail (input/output) float*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = x_head;
+ float *x_tail_i = x_tail;
+ float *y_i = (float *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ float y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_cdot2_s_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_d_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, double *AB,
+ int lda, double *x_head, double *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) double*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) double*
+ * x_tail (input/output) double*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = x_head;
+ double *x_tail_i = x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_d_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+
+ /* copy a_vec to AB */
+ dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_d_z_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, double *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) double*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = (double *) x_head;
+ double *x_tail_i = (double *) x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_z_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+
+ /* copy a_vec to AB */
+ dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_z_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, double *x_head, double *x_tail,
+ void *beta, int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) double*
+ * x_tail (input/output) double*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = x_head;
+ double *x_tail_i = x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_d_z_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_dgbmv2_s_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, float *AB,
+ int lda, float *x_head, float *x_tail,
+ double *beta, int beta_flag, double *y,
+ int *seed, double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) double*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) float*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) float*
+ * x_tail (input/output) float*
+ *
+ * beta (input/output) double*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) double*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = x_head;
+ float *x_tail_i = x_tail;
+ double *y_i = y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem;
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_ddot2_s_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, &y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem;
+
+
+ /* copy a_vec to AB */
+ sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_dgbmv2_s_d_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag, float *AB,
+ int lda, double *x_head, double *x_tail,
+ double *beta, int beta_flag, double *y,
+ int *seed, double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) double*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) float*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) double*
+ * x_tail (input/output) double*
+ *
+ * beta (input/output) double*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) double*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = x_head;
+ double *x_tail_i = x_tail;
+ double *y_i = y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem;
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ sgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_ddot2_d_s_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, &y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem;
+
+
+ /* copy a_vec to AB */
+ sgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_dgbmv2_d_s_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, double *alpha, int alpha_flag,
+ double *AB, int lda, float *x_head,
+ float *x_tail, double *beta, int beta_flag,
+ double *y, int *seed, double *r_true_l,
+ double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) double*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) double*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) float*
+ * x_tail (input/output) float*
+ *
+ * beta (input/output) double*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) double*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = x_head;
+ float *x_tail_i = x_tail;
+ double *y_i = y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem;
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+
+
+
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ dgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_ddot2_s_d_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, &y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem;
+
+
+ /* copy a_vec to AB */
+ dgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_c_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = (float *) x_head;
+ float *x_tail_i = (float *) x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_c_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_c_z_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ double *x_head_i = (double *) x_head;
+ double *x_tail_i = (double *) x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ float *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ cgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_z_c_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ cgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
+void BLAS_zgbmv2_z_c_testgen(int norm, enum blas_order_type order,
+ enum blas_trans_type trans, int m, int n, int kl,
+ int ku, void *alpha, int alpha_flag, void *AB,
+ int lda, void *x_head, void *x_tail, void *beta,
+ int beta_flag, void *y, int *seed,
+ double *r_true_l, double *r_true_t)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Generates alpha, AB, x, beta, and y, where AB is a banded
+ * matrix; and computes r_true.
+ *
+ * Arguments
+ * =========
+ *
+ * norm (input) blas_norm_type
+ *
+ * order (input) blas_order_type
+ * Order of AB; row or column major
+ *
+ * trans (input) blas_trans_type
+ * Whether AB is no trans, trans, or conj trans
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * kl (input) int
+ * The number of subdiagonals
+ *
+ * ku (input) int
+ * The number of superdiagonals
+ *
+ * alpha (input/output) void*
+ * If alpha_flag = 1, alpha is input.
+ * If alpha_flag = 0, alpha is output.
+ *
+ * alpha_flag (input) int
+ * = 0 : alpha is free, and is output.
+ * = 1 : alpha is fixed on input.
+ *
+ * AB (output) void*
+ * Matrix A in the banded storage.
+ *
+ *
+ * lda (input) int
+ * The first dimension of AB
+ *
+ * x_head (input/output) void*
+ * x_tail (input/output) void*
+ *
+ * beta (input/output) void*
+ * If beta_flag = 1, beta is input.
+ * If beta_flag = 0, beta is output.
+ *
+ * beta_flag (input) int
+ * = 0 : beta is free, and is output.
+ * = 1 : beta is fixed on input.
+ *
+ * y (input/output) void*
+ *
+ * seed (input/output) int
+ *
+ * r_true_l (output) double*
+ * The leading part of the truth in double-double.
+ *
+ * r_true_t (output) double*
+ * The trailing part of the truth in double-double.
+ *
+ */
+{
+ float *x_head_i = (float *) x_head;
+ float *x_tail_i = (float *) x_tail;
+ double *y_i = (double *) y;
+ int n_fix2;
+ int n_mix;
+ int ysize;
+ int i;
+ int j;
+ double *a_vec;
+ int m_i, n_i;
+ int max_mn;
+ int incy, incAB, incx;
+ double y_elem[2];
+
+ max_mn = MAX(m, n);
+ incx = incy = incAB = 1;
+ incy *= 2;
+ incAB *= 2;
+ incx *= 2;
+
+ if (trans == blas_no_trans) {
+ m_i = m;
+ n_i = n;
+ } else {
+ m_i = n;
+ n_i = m;
+ }
+
+ a_vec = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && a_vec == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* calling dot_testgen n time. in each iteration, one row of AB, and one
+ element of y are produced. the vector x is produced at the first
+ iteration only */
+ for (i = 0; i < m_i; i++) {
+ /* copy AB to a_vec */
+ zgbmv_prepare(order, trans, m, n, kl, ku, AB, lda, a_vec, i,
+ &n_fix2, &n_mix, &ysize);
+
+ if (i == 1) {
+ /* from now on, fix alpha and beta */
+ alpha_flag = 1;
+ beta_flag = 1;
+ }
+
+ BLAS_zdot2_c_z_testgen(ysize, n_fix2, n_mix, norm, blas_no_conj, alpha,
+ alpha_flag, beta, beta_flag, x_head, x_tail, a_vec,
+ seed, y_elem, &r_true_l[i * incy],
+ &r_true_t[i * incy]);
+ y_i[i * incy] = y_elem[0];
+ y_i[i * incy + 1] = y_elem[1];
+
+ if (trans == blas_conj_trans) {
+ for (j = 0; j < n_i * incAB; j += 2) {
+ a_vec[j + 1] = -a_vec[j + 1];
+ }
+ }
+ /* copy a_vec to AB */
+ zgbmv_commit(order, trans, m, n, kl, ku, AB, lda, a_vec, i);
+ }
+
+ /* Zero out trailing part of x */
+ for (i = ysize; i < n_i; i++) {
+ x_head_i[i * incx] = 0.0;
+ x_head_i[i * incx + 1] = 0.0;
+ x_tail_i[i * incx] = 0.0;
+ x_tail_i[i * incx + 1] = 0.0;
+ }
+
+ blas_free(a_vec);
+}
diff --git a/XBLAS/testing/test-gbmv2/Makefile b/XBLAS/testing/test-gbmv2/Makefile
new file mode 100644
index 00000000..533f0f49
--- /dev/null
+++ b/XBLAS/testing/test-gbmv2/Makefile
@@ -0,0 +1,43 @@
+include ../../make.conf
+include ../../$(MAKEINC)
+
+TEST_DOT_PATH = ../test-dot
+TEST_DOT2_PATH = ../test-dot2
+TEST_GBMV_PATH=../test-gbmv
+LIB_PATH = ../../$(OUTPUT_DIR)
+HEADER_PATH = -I../../src -I..
+
+TEST_DOT_OBJS = $(TEST_DOT_PATH)/testgen_aux.o \
+ $(TEST_DOT_PATH)/print_vector.o \
+ $(TEST_DOT_PATH)/copy_vector.o
+TEST_DOT2_OBJS = $(TEST_DOT2_PATH)/dot2.o \
+ $(TEST_DOT2_PATH)/testgen_BLAS_sdot2.o \
+ $(TEST_DOT2_PATH)/testgen_BLAS_ddot2.o \
+ $(TEST_DOT2_PATH)/testgen_BLAS_cdot2.o \
+ $(TEST_DOT2_PATH)/testgen_BLAS_zdot2.o \
+ $(TEST_DOT2_PATH)/test_dot2.o \
+ $(TEST_DOT2_PATH)/r_truth2.o \
+ $(TEST_DOT2_PATH)/BLAS_dot2_testgen.o
+TEST_GBMV_OBJS=../test-gbmv/gbmv-support.o
+
+TEST_SRCS = do_test_gbmv2.c BLAS_gbmv2_testgen.c
+TEST_OBJS = $(TEST_SRCS:.c=.o)
+COMMON_OBJS = ../common/dummy_main.o
+
+ALL_OBJS = $(TEST_OBJS) $(TEST_DOT_OBJS) $(TEST_DOT2_OBJS) \
+ $(TEST_GBMV_OBJS) $(COMMON_OBJS)
+
+test: do_test_gbmv2
+ @echo Testing GBMV2
+ ./do_test_gbmv2 12 1 1.0 0 0.01 > gbmv2.results
+
+do_test_gbmv2: $(ALL_OBJS) $(LIB_PATH)/$(XBLASLIB)
+ $(LINKER) $(LDFLAGS) $(ALL_OBJS) $(LIB_PATH)/$(XBLASLIB) \
+ -o do_test_gbmv2 $(EXTRA_LIBS)
+
+.c.o:
+ $(CC) $(CFLAGS) $(HEADER_PATH) -c -o $@ $<
+
+clean:
+ rm -f *.o *~ *.BAK gbmv2.results do_test_gbmv2 core
+
diff --git a/XBLAS/testing/test-gbmv2/do_test_gbmv2.c b/XBLAS/testing/test-gbmv2/do_test_gbmv2.c
new file mode 100644
index 00000000..bc76502d
--- /dev/null
+++ b/XBLAS/testing/test-gbmv2/do_test_gbmv2.c
@@ -0,0 +1,18419 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "blas_extended.h"
+#include "blas_extended_private.h"
+#include "blas_extended_test.h"
+
+
+double do_test_dgbmv2_d_s(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_d_s";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ double *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_d_s_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_d_s */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_d_s(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_d_s(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_s_d(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_s_d";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ float *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_s_d_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_s_d */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_s_d(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_s_d(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_s_s(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_s_s";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_s_s_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_s_s */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_s_s(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_s_s(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_z_c(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_z_c";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_z_c_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_z_c */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_z_c(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_z_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ zprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_c_z(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_c_z";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ float *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_c_z_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_c_z */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_c_z(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_c_z(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_c_c(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_c_c";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_c_c_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_c_c */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_c_c(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_c_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_c_s(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_c_s";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_c_s_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_c_s */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_c_s(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_c_s(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0], alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_s_c(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_s_c";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_s_c_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_s_c */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_s_c(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_s_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0], alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_s_s(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_s_s";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_s_s_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_s_s */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_s_s(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_s_s(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0], alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_z_d(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_z_d";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_z_d_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_z_d */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_z_d(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_z_d(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ zprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_d_z(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_d_z";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_d_z_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_d_z */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_d_z(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_d_z(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_d_d(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_d_d";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_d_d_testgen(norm, order_type, trans_type,
+ m, n, kl, ku, &alpha,
+ alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen, &beta,
+ beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_d_d */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_d_d(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_d_d(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i; j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_sgbmv2_x(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_sgbmv2_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha;
+ float beta;
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_sgbmv2_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_sgbmv2_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_sgbmv2_testgen(norm, order_type, trans_type, m,
+ n, kl, ku, &alpha, alpha_flag,
+ AB, lda, x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ scopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_sgbmv2_x */
+ FPU_FIX_STOP;
+ BLAS_sgbmv2_x(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_sdot2(n_i, blas_no_conj, alpha, beta,
+ y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen], temp,
+ 1, x_head, x_tail, incx_val,
+ eps_int, un_int, &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%16.8e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%16.8e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%16.8e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%16.8e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_x(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_testgen(norm, order_type, trans_type, m,
+ n, kl, ku, &alpha, alpha_flag,
+ AB, lda, x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_x */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_x(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2(n_i, blas_no_conj, alpha, beta,
+ y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen], temp,
+ 1, x_head, x_tail, incx_val,
+ eps_int, un_int, &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_x(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_testgen(norm, order_type, trans_type, m,
+ n, kl, ku, &alpha, alpha_flag,
+ AB, lda, x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_x */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_x(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2(n_i, blas_no_conj, alpha, beta,
+ &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_x(int m, int n, int ntests, int *seed, double thresh,
+ int debug, float test_prob, double *min_ratio,
+ int *num_bad_ratio, int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_testgen(norm, order_type, trans_type, m,
+ n, kl, ku, &alpha, alpha_flag,
+ AB, lda, x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen, seed,
+ head_r_true, tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_x(order_type, trans_type, m, n, kl,
+ ku, alpha, AB, lda, x_head, x_tail,
+ incx_val, beta, y, incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2(n_i, blas_no_conj, alpha, beta,
+ &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ zprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_d_s_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_d_s_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ double *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_d_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_d_s_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_d_s_x */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_d_s_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_d_s(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_s_d_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_s_d_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ float *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_s_d_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_s_d_x */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_s_d_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_s_d(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_dgbmv2_s_s_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_dgbmv2_s_s_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha;
+ double beta;
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_dgbmv2_s_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha = 1.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta = 1.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_dgbmv2_s_s_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+
+
+ dcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_dgbmv2_s_s_x */
+ FPU_FIX_STOP;
+ BLAS_dgbmv2_s_s_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_ddot2_s_s(n_i, blas_no_conj, alpha,
+ beta, y_gen[j * incy_gen],
+ y[iy],
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("%24.16e", y_gen[k * incy_gen]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("%24.16e", y[iy]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("%24.16e", alpha);
+ printf("\n ");
+ printf("beta = ");
+ printf("%24.16e", beta);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf("[%24.16e, %24.16e]",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_z_c_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_z_c_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_z_c_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_z_c_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_z_c_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_z_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ zprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_c_z_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_c_z_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ float *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_z_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_c_z_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_c_z_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_c_z_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_c_z(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_c_c_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_c_c_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ double *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_c_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_c_c_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_c_c_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_c_c_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_c_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_c_s_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_c_s_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_c_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_c_s_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_c_s_x */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_c_s_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_c_s(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ cgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ cprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_s_c_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_s_c_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_c_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_s_c_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ ccopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ ccopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_s_c_x */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_s_c_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_s_c(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_cgbmv2_s_s_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_cgbmv2_s_s_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ float alpha[2];
+ float beta[2];
+ float *AB;
+ float *x_head;
+ float *x_tail;
+ float *y;
+ float *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ float *x_head_gen;
+ float *x_tail_gen;
+ float *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_cgbmv2_s_s_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (float *) blas_malloc(max_mn * 2 * sizeof(float));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (float *) blas_malloc(max_mn * 2 * sizeof(float) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (float *) blas_malloc(max_mn * sizeof(float) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (float *) blas_malloc(max_mn * sizeof(float));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (float *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(float));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_S);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_single),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_single));
+ prec = blas_prec_single;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_cgbmv2_s_s_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ scopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ scopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ ccopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_cgbmv2_s_s_x */
+ FPU_FIX_STOP;
+ BLAS_cgbmv2_s_s_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_cdot2_s_s(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ sgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ sprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%16.8e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%16.8e, %16.8e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%16.8e, %16.8e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%16.8e, %16.8e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%16.8e, %16.8e)", beta[0], beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_z_d_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_z_d_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_z_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double) *
+ 2);
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_z_d_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_z_d_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_z_d_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_z_d(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ zgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ zprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_d_z_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_d_z_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_z_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+ incx_gen *= 2;
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_d_z_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+ incx *= 2;
+
+ zcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ zcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_d_z_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_d_z_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_d_z(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_head[ix],
+ x_head[ix + 1]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)", x_tail[ix],
+ x_tail[ix + 1]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+double do_test_zgbmv2_d_d_x(int m, int n, int ntests, int *seed,
+ double thresh, int debug, float test_prob,
+ double *min_ratio, int *num_bad_ratio,
+ int *num_tests)
+
+/*
+ * Purpose
+ * =======
+ *
+ * Runs a series of tests on GBMV2.
+ *
+ * Arguments
+ * =========
+ *
+ * m (input) int
+ * The number of rows
+ *
+ * n (input) int
+ * The number of columns
+ *
+ * ntests (input) int
+ * The number of tests to run for each set of attributes.
+ *
+ * seed (input/output) int
+ * The seed for the random number generator used in testgen().
+ *
+ * thresh (input) double
+ * When the ratio returned from test() exceeds the specified
+ * threshold, the current size, r_true, r_comp, and ratio will be
+ * printed. (Since ratio is supposed to be O(1), we can set thresh
+ * to ~10.)
+ *
+ * debug (input) int
+ * If debug=3, print summary
+ * If debug=2, print summary only if the number of bad ratios > 0
+ * If debug=1, print complete info if tests fail
+ * If debug=0, return max ratio
+ *
+ * test_prob (input) float
+ * The specified test will be performed only if the generated
+ * random exceeds this threshold.
+ *
+ * min_ratio (output) double
+ * The minimum ratio
+ *
+ * num_bad_ratio (output) int
+ * The number of tests fail; they are above the threshold.
+ *
+ * num_tests (output) int
+ * The number of tests is being performed.
+ *
+ * Return value
+ * ============
+ *
+ * The maximum ratio if run successfully, otherwise return -1
+ *
+ * Code structure
+ * ==============
+ *
+ * debug loop -- if debug is one, the first loop computes the max ratio
+ * -- and the last(second) loop outputs debugging information,
+ * -- if the test fail and its ratio > 0.5 * max ratio.
+ * -- if debug is zero, the loop is executed once
+ * alpha loop -- varying alpha: 0, 1, or random
+ * beta loop -- varying beta: 0, 1, or random
+ * prec loop -- varying internal prec: single, double, or extra
+ * norm loop -- varying norm: near undeflow, near one, or
+ * -- near overflow
+ * numtest loop -- how many times the test is perform with
+ * -- above set of attributes
+ * order loop -- varying order type: rowmajor or colmajor
+ * trans loop -- varying trans type: no trans, trans, or conj trans
+ * ku loop -- varying ku: 0 to n-1
+ * kl loop -- varying kl: 0 to m-1
+ * lda loop -- varying lda: ku+kl+1, ku+kl+2, 2*(ku+kl+1)
+ * incx loop -- varying incx: -2, -1, 1, 2
+ * incy loop -- varying incy: -2, -1, 1, 2
+ */
+{
+ /* function name */
+ const char fname[] = "BLAS_zgbmv2_d_d_x";
+
+ /* max number of debug lines to print */
+ const int max_print = 8;
+
+ /* Variables in the "x_val" form are loop vars for corresponding
+ variables */
+ int i; /* iterate through the repeating tests */
+ int j;
+ int k; /* multipurpose counters or variables */
+ int ix, iy; /* use to index x and y respectively */
+ int incx_val, incy_val, /* for testing different inc values */
+ incx, incy;
+ int incx_gen, incy_gen; /* for complex case inc=2, for real case inc=1 */
+ int d_count; /* counter for debug */
+ int find_max_ratio; /* find_max_ratio = 1 only if debug = 3 */
+ int p_count; /* counter for the number of debug lines printed */
+ int tot_tests; /* total number of tests to be done */
+ int norm; /* input values of near underflow/one/overflow */
+ double ratio_max; /* the current maximum ratio */
+ double ratio_min; /* the current minimum ratio */
+ double *ratios; /* a temporary variable for calculating ratio */
+ double ratio; /* the per-use test ratio from test() */
+ int bad_ratios = 0; /* the number of ratios over the threshold */
+ double eps_int; /* the internal epsilon expected--2^(-24) for float */
+ double un_int; /* the internal underflow threshold */
+ double alpha[2];
+ double beta[2];
+ double *AB;
+ double *x_head;
+ double *x_tail;
+ double *y;
+ double *temp; /* use for calculating ratio */
+
+ /* x_gen and y_gen are used to store vectors generated by testgen.
+ they eventually are copied back to x and y */
+ double *x_head_gen;
+ double *x_tail_gen;
+ double *y_gen;
+
+ /* the true r calculated by testgen(), in double-double */
+ double *head_r_true, *tail_r_true;
+
+ int alpha_val;
+ int alpha_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */
+ int beta_val;
+ int beta_flag = 0; /* input flag for BLAS_zgbmv2_d_d_testgen */
+ int order_val;
+ enum blas_order_type order_type = 0;
+ int prec_val;
+ enum blas_prec_type prec = 0;
+ int trans_val;
+ enum blas_trans_type trans_type = 0;
+ int m_i = 0;
+ int n_i = 0;
+ int max_mn; /* the max of m and n */
+ int ku;
+ int kl;
+ int lda_val;
+ int lda = 0;
+ int saved_seed; /* for saving the original seed */
+
+ /* use for counting the number of testgen calls * 2 */
+ int count, old_count = -1;
+
+ FPU_FIX_DECL;
+
+ /* test for bad arguments */
+ if (n < 0 || m < 0 || ntests < 0)
+ BLAS_error(fname, 0, 0, NULL);
+
+ /* initialization */
+ *min_ratio = 0.0;
+ *num_bad_ratio = 0;
+ *num_tests = 0;
+
+ saved_seed = *seed;
+ ratio_min = 1e308;
+ ratio_max = 0.0;
+ ratio = 0.0;
+ tot_tests = 0;
+ p_count = 0;
+ count = 0;
+ find_max_ratio = 0;
+ if (debug == 3)
+ find_max_ratio = 1;
+ max_mn = MAX(m, n);
+
+ if (m == 0 || n == 0) {
+ return 0.0;
+ }
+
+ FPU_FIX_START;
+
+ incx_gen = incy_gen = 1;
+
+ incy_gen *= 2;
+
+ /* get space for calculation */
+ x_head = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_head == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail = (double *) blas_malloc(max_mn * 2 * sizeof(double));
+ if (max_mn * 2 > 0 && x_tail == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y = (double *) blas_malloc(max_mn * 2 * sizeof(double) * 2);
+ if (max_mn * 2 > 0 && y == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_head_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_head_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ x_tail_gen = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && x_tail_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ y_gen = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && y_gen == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ temp = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && temp == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ head_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ tail_r_true = (double *) blas_malloc(max_mn * sizeof(double) * 2);
+ if (max_mn > 0 && (head_r_true == NULL || tail_r_true == NULL)) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+ ratios = (double *) blas_malloc(max_mn * sizeof(double));
+ if (max_mn > 0 && ratios == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ AB =
+ (double *) blas_malloc((m - 1 + n - 1 + 1) * max_mn * 2 * sizeof(double));
+ if ((m - 1 + n - 1 + 1) * max_mn * 2 > 0 && AB == NULL) {
+ BLAS_error("blas_malloc", 0, 0, "malloc failed.\n");
+ }
+
+ /* The debug iteration:
+ If debug=1, then will execute the iteration twice. First, compute the
+ max ratio. Second, print info if ratio > (50% * ratio_max). */
+ for (d_count = 0; d_count <= find_max_ratio; d_count++) {
+ bad_ratios = 0; /* set to zero */
+
+ if ((debug == 3) && (d_count == find_max_ratio))
+ *seed = saved_seed; /* restore the original seed */
+
+ /* varying alpha */
+ for (alpha_val = 0; alpha_val < 3; alpha_val++) {
+ alpha_flag = 0;
+ switch (alpha_val) {
+ case 0:
+ alpha[0] = alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ case 1:
+ alpha[0] = 1.0;
+ alpha[1] = 0.0;
+ alpha_flag = 1;
+ break;
+ }
+
+ /* varying beta */
+ for (beta_val = 0; beta_val < 3; beta_val++) {
+ beta_flag = 0;
+ switch (beta_val) {
+ case 0:
+ beta[0] = beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ case 1:
+ beta[0] = 1.0;
+ beta[1] = 0.0;
+ beta_flag = 1;
+ break;
+ }
+
+
+ /* varying extra precs */
+ for (prec_val = 0; prec_val <= 2; prec_val++) {
+ switch (prec_val) {
+ case 0:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 1:
+ eps_int = power(2, -BITS_D);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_double),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_double));
+ prec = blas_prec_double;
+ break;
+ case 2:
+ default:
+ eps_int = power(2, -BITS_E);
+ un_int = pow((double) BLAS_fpinfo_x(blas_base, blas_prec_extra),
+ (double) BLAS_fpinfo_x(blas_emin, blas_prec_extra));
+ prec = blas_prec_extra;
+ break;
+ }
+
+ /* values near underflow, 1, or overflow */
+ for (norm = -1; norm <= 1; norm++) {
+
+ /* number of tests */
+ for (i = 0; i < ntests; i++) {
+
+ /* row or col major */
+ for (order_val = 0; order_val < 2; order_val++) {
+ switch (order_val) {
+ case 0:
+ order_type = blas_rowmajor;
+ break;
+ case 1:
+ order_type = blas_colmajor;
+ break;
+ }
+
+ /* no_trans, trans, or conj_trans */
+ for (trans_val = 0; trans_val < 3; trans_val++) {
+ switch (trans_val) {
+ case 0:
+ trans_type = blas_no_trans;
+ m_i = m;
+ n_i = n;
+ break;
+ case 1:
+ trans_type = blas_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ case 2:
+ trans_type = blas_conj_trans;
+ m_i = n;
+ n_i = m;
+ break;
+ }
+
+ /* ku from 0 to n-1 */
+ for (ku = 0; ku < n; ku++) {
+ if (ku == n && ku != 0)
+ continue; /* the purpose of doing this is
+ to test for ku=0 */
+
+ /* kl from 0 to m-1 */
+ for (kl = 0; kl < m; kl++) {
+ if (kl == n && kl != 0)
+ continue; /* the purpose of doing this is
+ to test for kl=0 */
+
+ /* lda=ku+kl+1, ku+kl+2, 2*(ku+kl+1) */
+ for (lda_val = 0; lda_val < 3; lda_val++) {
+ switch (lda_val) {
+ case 0:
+ lda = ku + kl + 1;
+ break;
+ case 1:
+ lda = ku + kl + 2;
+ break;
+ case 2:
+ lda = 2 * (ku + kl + 1);
+ break;
+ }
+
+ if ((order_type == blas_rowmajor && lda < n) ||
+ (order_type == blas_colmajor && lda < m))
+ continue;
+
+ /* For the sake of speed, we throw out this case at random */
+ if (xrand(seed) >= test_prob)
+ continue;
+
+ /* in the trivial cases, no need to run testgen */
+ if (m > 0 && n > 0)
+ BLAS_zgbmv2_d_d_testgen(norm, order_type,
+ trans_type, m, n, kl, ku,
+ &alpha, alpha_flag, AB, lda,
+ x_head_gen, x_tail_gen,
+ &beta, beta_flag, y_gen,
+ seed, head_r_true,
+ tail_r_true);
+ count++;
+
+ /* varying incx */
+ for (incx_val = -2; incx_val <= 2; incx_val++) {
+ if (incx_val == 0)
+ continue;
+
+ /* setting incx */
+ incx = incx_val;
+
+
+ dcopy_vector(x_head_gen, n_i, 1, x_head, incx_val);
+ dcopy_vector(x_tail_gen, n_i, 1, x_tail, incx_val);
+
+ /* varying incy */
+ for (incy_val = -2; incy_val <= 2; incy_val++) {
+ if (incy_val == 0)
+ continue;
+
+ /* setting incy */
+ incy = incy_val;
+ incy *= 2;
+
+ zcopy_vector(y_gen, m_i, 1, y, incy_val);
+
+ /* call BLAS_zgbmv2_d_d_x */
+ FPU_FIX_STOP;
+ BLAS_zgbmv2_d_d_x(order_type, trans_type, m, n,
+ kl, ku, alpha, AB, lda, x_head,
+ x_tail, incx_val, beta, y,
+ incy_val, prec);
+ FPU_FIX_START;
+
+ /* set y starting index */
+ iy = 0;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ /* computing the ratio */
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of AB to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl, ku,
+ AB, lda, temp, j);
+
+ test_BLAS_zdot2_d_d(n_i, blas_no_conj, alpha,
+ beta, &y_gen[j * incy_gen],
+ &y[iy],
+ &head_r_true[j * incy_gen],
+ &tail_r_true[j * incy_gen],
+ temp, 1, x_head, x_tail,
+ incx_val, eps_int, un_int,
+ &ratios[j]);
+
+ /* take the max ratio */
+ if (j == 0) {
+ ratio = ratios[0];
+ /* The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ } else if (!(ratios[j] <= ratio)) {
+ ratio = ratios[j];
+ }
+
+ iy += incy;
+ }
+
+ /* Increase the number of bad ratio, if the ratio
+ is bigger than the threshold.
+ The !<= below causes NaN error to be detected.
+ Note that (NaN > thresh) is always false. */
+ if (!(ratio <= thresh)) {
+ bad_ratios++;
+
+ if ((debug == 3) && /* print only when debug is on */
+ (count != old_count) && /* print if old vector is different
+ from the current one */
+ (d_count == find_max_ratio) &&
+ (p_count <= max_print) &&
+ (ratio > 0.5 * ratio_max)) {
+ old_count = count;
+ printf
+ ("FAIL> %s: m = %d, n = %d, ntests = %d, threshold = %4.2f,\n",
+ fname, m, n, ntests, thresh);
+
+ /* Print test info */
+ switch (prec) {
+ case blas_prec_single:
+ printf("single ");
+ break;
+ case blas_prec_double:
+ printf("double ");
+ break;
+ case blas_prec_indigenous:
+ printf("indigenous ");
+ break;
+ case blas_prec_extra:
+ printf("extra ");
+ break;
+ }
+ switch (norm) {
+ case -1:
+ printf("near_underflow ");
+ break;
+ case 0:
+ printf("near_one ");
+ break;
+ case 1:
+ printf("near_overflow ");
+ break;
+ }
+ switch (order_type) {
+ case blas_rowmajor:
+ printf("row_major ");
+ break;
+ case blas_colmajor:
+ printf("col_major ");
+ break;
+ }
+ switch (trans_type) {
+ case blas_no_trans:
+ printf("no_trans ");
+ break;
+ case blas_trans:
+ printf("trans ");
+ break;
+ case blas_conj_trans:
+ printf("conj_trans ");
+ break;
+ }
+
+ printf
+ ("ku=%d, kl=%d, lda=%d, incx=%d, incy=%d:\n",
+ ku, kl, lda, incx, incy);
+
+ ix = 0;
+ iy = 0;
+ if (incx < 0)
+ ix = -(n_i - 1) * incx;
+ if (incy < 0)
+ iy = -(m_i - 1) * incy;
+
+ printf(" A=");
+ for (j = 0; j < m_i; j++) {
+ /* copy row j of A to temp */
+ dgbmv_copy(order_type, trans_type, m, n, kl,
+ ku, AB, lda, temp, j);
+
+ if (j > 0)
+ printf(" ");
+ dprint_vector(temp, n_i, 1, NULL);
+ }
+
+ for (j = 0, k = 0; j < n_i || k < m_i;
+ j++, k++) {
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_head[ix]);
+ printf("\n");
+ }
+ if (j < n_i) {
+ printf(" ");
+ printf("%24.16e", x_tail[ix]);
+ printf("\n");
+ }
+ if (k < m_i) {
+ printf(" ");
+ printf("(%24.16e, %24.16e)",
+ y_gen[k * incy_gen],
+ y_gen[k * incy_gen + 1]);
+ printf("\n");
+ printf(" ");
+ printf("y_final[%d] = ", iy);
+ printf("(%24.16e, %24.16e)", y[iy],
+ y[iy + 1]);
+ printf("\n");
+ }
+ ix += incx;
+ iy += incy;
+ }
+
+ printf(" ");
+ printf("alpha = ");
+ printf("(%24.16e, %24.16e)", alpha[0],
+ alpha[1]);
+ printf("\n ");
+ printf("beta = ");
+ printf("(%24.16e, %24.16e)", beta[0],
+ beta[1]);
+ printf("\n");
+ for (j = 0; j < m_i; j++) {
+ printf(" ");
+ printf
+ ("([%24.16e %24.16e], [%24.16e %24.16e])",
+ head_r_true[j * incy_gen],
+ tail_r_true[j * incy_gen],
+ head_r_true[j * incy_gen + 1],
+ tail_r_true[j * incy_gen + 1]);
+ printf(", ratio[%d]=%.4e\n", j, ratios[j]);
+ }
+
+ printf(" ratio=%.4e\n", ratio);
+ p_count++;
+ }
+ if (bad_ratios >= MAX_BAD_TESTS) {
+ printf("\ntoo many failures, exiting....");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ if (!(ratio <= TOTAL_FAILURE_THRESHOLD)) {
+ printf("\nFlagrant ratio error, exiting...");
+ printf("\nTesting and compilation");
+ printf(" are incomplete\n\n");
+ goto end;
+ }
+ }
+ if (d_count == 0) {
+ if (ratio > ratio_max)
+ ratio_max = ratio;
+
+ if (ratio != 0.0 && ratio < ratio_min)
+ ratio_min = ratio;
+ tot_tests++;
+ }
+ } /* incy */
+ } /* incx */
+ } /* lda */
+ } /* kl */
+ } /* ku */
+ } /* trans */
+ } /* order */
+ } /* tests */
+ } /* norm */
+ } /* prec */
+ } /* beta */
+ } /* alpha */
+ } /* debug */
+
+ if ((debug == 2) || ((debug == 1) && bad_ratios > 0)) {
+ printf(" %s: m = %d, n = %d, ntests = %d, thresh = %4.2f\n", fname,
+ m, n, ntests, thresh);
+ printf
+ (" bad/total = %d/%d=%3.2f, min_ratio = %.4e, max_ratio = %.4e\n\n",
+ bad_ratios, tot_tests, ((double) bad_ratios) / ((double) tot_tests),
+ ratio_min, ratio_max);
+ }
+
+end:
+ blas_free(x_head);
+ blas_free(x_tail);
+ blas_free(y);
+ blas_free(x_head_gen);
+ blas_free(x_tail_gen);
+ blas_free(y_gen);
+ blas_free(temp);
+ blas_free(AB);
+ blas_free(head_r_true);
+ blas_free(tail_r_true);
+ blas_free(ratios);
+
+ FPU_FIX_STOP;
+
+ *min_ratio = ratio_min;
+ *num_bad_ratio = bad_ratios;
+ *num_tests = tot_tests;
+ return ratio_max;
+}
+
+#define NUMPAIRS 12
+
+int main(int argc, char **argv)
+{
+ int nsizes, ntests, debug;
+ double thresh, test_prob;
+ double total_min_ratio, total_max_ratio;
+ int total_bad_ratios;
+ int seed, num_bad_ratio, num_tests;
+ int total_tests, nr_failed_routines = 0, nr_routines = 0;
+ double min_ratio, max_ratio;
+ const char *base_routine = "gbmv2";
+ char *fname;
+ int n;
+
+ int m, i;
+ int mn_pairs[NUMPAIRS][2] =
+ { {0, 0}, {1, 0}, {0, 1}, {1, 2}, {2, 1}, {1, 3},
+ {3, 1}, {2, 3}, {4, 3}, {2, 4}, {6, 6}, {10, 8}
+ };
+
+ if (argc != 6) {
+ printf("Usage:\n");
+ printf("do_test_gbmv2 <nsizes> <ntests> <thresh> <debug> <test_prob>\n");
+ printf(" <nsizes>: number of sizes to be run.\n");
+ printf
+ (" <ntests>: the number of tests performed for each set of attributes\n");
+ printf
+ (" <thresh>: to catch bad ratios if it is greater than <thresh>\n");
+ printf(" <debug>: 0, 1, 2, or 3; \n");
+ printf(" if 0, no printing \n");
+ printf(" if 1, print error summary only if tests fail\n");
+ printf(" if 2, print error summary for each n\n");
+ printf(" if 3, print complete info each test fails \n");
+ printf("<test_prob>: probability of preforming a given \n");
+ printf(" test case: 0.0 does no tests, 1.0 does all tests\n");
+ return -1;
+ } else {
+ nsizes = atoi(argv[1]);
+ ntests = atoi(argv[2]);
+ thresh = atof(argv[3]);
+ debug = atoi(argv[4]);
+ test_prob = atof(argv[5]);
+ }
+
+ seed = 1999;
+
+ if (nsizes < 0 || ntests < 0 || debug < 0 || debug > 3)
+ BLAS_error("Testing gbmv2", 0, 0, NULL);
+
+ printf("Testing %s...\n", base_routine);
+ printf("INPUT: nsizes = %d, ntests = %d, thresh = %4.2f, debug = %d\n\n",
+ nsizes, ntests, thresh, debug);
+
+
+
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_d_s";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_d_s(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_s_d";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_s_d(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_s_s";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_s_s(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_z_c";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_z_c(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_c_z";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_c_z(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_c_c";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_c_c(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_c_s";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_c_s(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_s_c";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_s_c(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_s_s";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_s_s(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_z_d";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_z_d(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_d_z";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_d_z(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_d_d";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_d_d(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_sgbmv2_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_sgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_d_s_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_d_s_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_s_d_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_s_d_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_dgbmv2_s_s_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_dgbmv2_s_s_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_z_c_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_z_c_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_c_z_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_c_z_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_c_c_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_c_c_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_c_s_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_c_s_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_s_c_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_s_c_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_cgbmv2_s_s_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_cgbmv2_s_s_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_z_d_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_z_d_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_d_z_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_d_z_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+ min_ratio = 1e308;
+ max_ratio = 0.0;
+ total_bad_ratios = 0;
+ total_tests = 0;
+ fname = "BLAS_zgbmv2_d_d_x";
+ printf("Testing %s...\n", fname);
+ for (i = 0; i < nsizes; i++) {
+ m = mn_pairs[i][0];
+ n = mn_pairs[i][1];
+ total_max_ratio =
+ do_test_zgbmv2_d_d_x(m, n, 1, &seed, thresh, debug, test_prob,
+ &total_min_ratio, &num_bad_ratio, &num_tests);
+ if (total_max_ratio > max_ratio)
+ max_ratio = total_max_ratio;
+
+ if (total_min_ratio != 0.0 && total_min_ratio < min_ratio)
+ min_ratio = total_min_ratio;
+
+ total_bad_ratios += num_bad_ratio;
+ total_tests += num_tests;
+ }
+
+ if (total_bad_ratios == 0)
+ printf("PASS> ");
+ else {
+ nr_failed_routines++;
+ printf("FAIL> ");
+ }
+ nr_routines++;
+
+ if (min_ratio == 1e308)
+ min_ratio = 0.0;
+
+ printf("%-24s: bad/total = %d/%d, max_ratio = %.2e\n\n", fname,
+ total_bad_ratios, total_tests, max_ratio);
+
+
+ printf("\n");
+ if (nr_failed_routines)
+ printf("FAILED ");
+ else
+ printf("PASSED ");
+ printf("%-10s: FAIL/TOTAL = %d/%d\n",
+ base_routine, nr_failed_routines, nr_routines);
+
+ return 0;
+}