summaryrefslogtreecommitdiff
path: root/SRC
diff options
context:
space:
mode:
Diffstat (limited to 'SRC')
-rw-r--r--SRC/Makefile30
-rw-r--r--SRC/chb2st_kernels.f320
-rw-r--r--SRC/chbev_2stage.f386
-rw-r--r--SRC/chbevd_2stage.f458
-rw-r--r--SRC/chbevx_2stage.f646
-rw-r--r--SRC/cheev_2stage.f355
-rw-r--r--SRC/cheevd_2stage.f451
-rw-r--r--SRC/cheevr_2stage.f779
-rw-r--r--SRC/cheevx_2stage.f618
-rw-r--r--SRC/chegv_2stage.f379
-rw-r--r--SRC/chetrd_2stage.f337
-rw-r--r--SRC/chetrd_hb2st.F603
-rw-r--r--SRC/chetrd_he2hb.f517
-rw-r--r--SRC/clarfy.f163
-rw-r--r--SRC/dlarfy.f161
-rw-r--r--SRC/dsb2st_kernels.f320
-rw-r--r--SRC/dsbev_2stage.f377
-rw-r--r--SRC/dsbevd_2stage.f412
-rw-r--r--SRC/dsbevx_2stage.f633
-rw-r--r--SRC/dsyev_2stage.f348
-rw-r--r--SRC/dsyevd_2stage.f406
-rw-r--r--SRC/dsyevr_2stage.f740
-rw-r--r--SRC/dsyevx_2stage.f608
-rw-r--r--SRC/dsygv_2stage.f370
-rw-r--r--SRC/dsytrd_2stage.f337
-rw-r--r--SRC/dsytrd_sb2st.F603
-rw-r--r--SRC/dsytrd_sy2sb.f517
-rw-r--r--SRC/ilaenv.f10
-rw-r--r--SRC/iparam2stage.F388
-rw-r--r--SRC/slarfy.f161
-rw-r--r--SRC/ssb2st_kernels.f320
-rw-r--r--SRC/ssbev_2stage.f377
-rw-r--r--SRC/ssbevd_2stage.f412
-rw-r--r--SRC/ssbevx_2stage.f633
-rw-r--r--SRC/ssyev_2stage.f348
-rw-r--r--SRC/ssyevd_2stage.f406
-rw-r--r--SRC/ssyevr_2stage.f745
-rw-r--r--SRC/ssyevx_2stage.f608
-rw-r--r--SRC/ssygv_2stage.f371
-rw-r--r--SRC/ssytrd_2stage.f337
-rw-r--r--SRC/ssytrd_sb2st.F603
-rw-r--r--SRC/ssytrd_sy2sb.f517
-rw-r--r--SRC/zhb2st_kernels.f320
-rw-r--r--SRC/zhbev_2stage.f386
-rw-r--r--SRC/zhbevd_2stage.f458
-rw-r--r--SRC/zhbevx_2stage.f646
-rw-r--r--SRC/zheev_2stage.f355
-rw-r--r--SRC/zheevd_2stage.f451
-rw-r--r--SRC/zheevr_2stage.f779
-rw-r--r--SRC/zheevx_2stage.f618
-rw-r--r--SRC/zhegv_2stage.f379
-rw-r--r--SRC/zhetrd_2stage.f337
-rw-r--r--SRC/zhetrd_hb2st.F603
-rw-r--r--SRC/zhetrd_he2hb.f517
-rw-r--r--SRC/zlarfy.f163
55 files changed, 24112 insertions, 10 deletions
diff --git a/SRC/Makefile b/SRC/Makefile
index 01cf7021..c521d7f6 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -56,7 +56,7 @@ include ../make.inc
#
#######################################################################
-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \
+ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o iparam2stage.o \
ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \
../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o
@@ -120,7 +120,7 @@ SLASRC = \
slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
- slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \
+ slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
slarrv.o slartv.o \
slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
slasyf_rk.o \
@@ -167,7 +167,10 @@ SLASRC = \
sgelqt.o sgelqt3.o sgemlqt.o \
sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
sgelq.o slaswlq.o slamswlq.o sgemlq.o \
- stplqt.o stplqt2.o stpmlqt.o
+ stplqt.o stplqt2.o stpmlqt.o \
+ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \
+ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \
+ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o
DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o
@@ -224,7 +227,7 @@ CLASRC = \
claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \
- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
+ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \
claswp.o clasyf.o clasyf_rook.o clasyf_rk.o \
clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \
@@ -263,7 +266,10 @@ CLASRC = \
cgelqt.o cgelqt3.o cgemlqt.o \
cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
cgelq.o claswlq.o clamswlq.o cgemlq.o \
- ctplqt.o ctplqt2.o ctpmlqt.o
+ ctplqt.o ctplqt2.o ctpmlqt.o \
+ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \
+ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \
+ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o
ifdef USEXBLAS
CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \
@@ -306,7 +312,7 @@ DLASRC = \
dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
- dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \
+ dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
dlargv.o dlarrv.o dlartv.o \
dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
dlasyf.o dlasyf_rook.o dlasyf_rk.o \
@@ -355,7 +361,10 @@ DLASRC = \
dgelqt.o dgelqt3.o dgemlqt.o \
dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \
- dtplqt.o dtplqt2.o dtpmlqt.o
+ dtplqt.o dtplqt2.o dtpmlqt.o \
+ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \
+ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \
+ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o
ifdef USEXBLAS
DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \
@@ -413,7 +422,7 @@ ZLASRC = \
zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
zlarcm.o zlarf.o zlarfb.o \
zlarfg.o zlarft.o zlarfgp.o \
- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
+ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
zlassq.o zlaswp.o zlasyf.o zlasyf_rook.o zlasyf_rk.o \
zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlauu2.o \
@@ -455,7 +464,10 @@ ZLASRC = \
zgelqt.o zgelqt3.o zgemlqt.o \
zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \
- ztplqt.o ztplqt2.o ztpmlqt.o
+ ztplqt.o ztplqt2.o ztpmlqt.o \
+ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
+ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \
+ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o
ifdef USEXBLAS
ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \
diff --git a/SRC/chb2st_kernels.f b/SRC/chb2st_kernels.f
new file mode 100644
index 00000000..8b0a4b28
--- /dev/null
+++ b/SRC/chb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b CHB2ST_KERNELS
+*
+* @generated from zhb2st_kernels.f, fortran z -> c, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chb2st_kernels.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+* ST, ED, SWEEP, N, NB, IB,
+* A, LDA, V, TAU, LDVT, WORK)
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* LOGICAL WANTZ
+* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+* COMPLEX A( LDA, * ), V( * ),
+* TAU( * ), WORK( * )
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHB2ST_KERNELS is an internal routine used by the CHETRD_HB2ST
+*> subroutine.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> @param[in] n
+*> The order of the matrix A.
+*>
+*> @param[in] nb
+*> The size of the band.
+*>
+*> @param[in, out] A
+*> A pointer to the matrix A.
+*>
+*> @param[in] lda
+*> The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*> COMPLEX array, dimension 2*n if eigenvalues only are
+*> requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*> COMPLEX array, dimension (2*n).
+*> The scalar factors of the Householder reflectors are stored
+*> in this array.
+*>
+*> @param[in] st
+*> internal parameter for indices.
+*>
+*> @param[in] ed
+*> internal parameter for indices.
+*>
+*> @param[in] sweep
+*> internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*> internal parameter for indices.
+*>
+*> @param[in] wantz
+*> logical which indicate if Eigenvalue are requested or both
+*> Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*> Workspace of size nb.
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+ $ ST, ED, SWEEP, N, NB, IB,
+ $ A, LDA, V, TAU, LDVT, WORK)
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ LOGICAL WANTZ
+ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+ COMPLEX A( LDA, * ), V( * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ),
+ $ ONE = ( 1.0E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL UPPER
+ INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS,
+ $ DPOS, OFDPOS, AJETER
+ COMPLEX CTMP
+* ..
+* .. External Subroutines ..
+ EXTERNAL CLARFG, CLARFX, CLARFY
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG, MOD
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* ..
+* .. Executable Statements ..
+*
+ AJETER = IB + LDVT
+ UPPER = LSAME( UPLO, 'U' )
+
+ IF( UPPER ) THEN
+ DPOS = 2 * NB + 1
+ OFDPOS = 2 * NB
+ ELSE
+ DPOS = 1
+ OFDPOS = 2
+ ENDIF
+
+*
+* Upper case
+*
+ IF( UPPER ) THEN
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 101, 102, 103 ) TTYPE
+*
+ 101 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 10 I = 1, LM-1
+ V( VPOS+I ) = CONJG( A( OFDPOS-I, ST+I ) )
+ A( OFDPOS-I, ST+I ) = ZERO
+ 10 CONTINUE
+ CTMP = CONJG( A( OFDPOS, ST ) )
+ CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+ A( OFDPOS, ST ) = CTMP
+*
+ 103 CONTINUE
+ LM = ED - ST + 1
+ CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+ GOTO 300
+*
+ 102 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+ IF( LM.GT.0) THEN
+ CALL CLARFX( 'Left', LN, LM, V( VPOS ),
+ $ CONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 30 I = 1, LM-1
+ V( VPOS+I ) = CONJG( A( DPOS-NB-I, J1+I ) )
+ A( DPOS-NB-I, J1+I ) = ZERO
+ 30 CONTINUE
+ CTMP = CONJG( A( DPOS-NB, J1 ) )
+ CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+ A( DPOS-NB, J1 ) = CTMP
+*
+ CALL CLARFX( 'Right', LN-1, LM, V( VPOS ),
+ $ TAU( TAUPOS ),
+ $ A( DPOS-NB+1, J1 ), LDA-1, WORK)
+ ENDIF
+ GOTO 300
+*
+* Lower case
+*
+ ELSE
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 201, 202, 203 ) TTYPE
+*
+ 201 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 20 I = 1, LM-1
+ V( VPOS+I ) = A( OFDPOS+I, ST-1 )
+ A( OFDPOS+I, ST-1 ) = ZERO
+ 20 CONTINUE
+ CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ 203 CONTINUE
+ LM = ED - ST + 1
+*
+ CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+
+ GOTO 300
+*
+ 202 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+*
+ IF( LM.GT.0) THEN
+ CALL CLARFX( 'Right', LM, LN, V( VPOS ),
+ $ TAU( TAUPOS ), A( DPOS+NB, ST ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 40 I = 1, LM-1
+ V( VPOS+I ) = A( DPOS+NB+I, ST )
+ A( DPOS+NB+I, ST ) = ZERO
+ 40 CONTINUE
+ CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ CALL CLARFX( 'Left', LM, LN-1, V( VPOS ),
+ $ CONJG( TAU( TAUPOS ) ),
+ $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+ ENDIF
+ GOTO 300
+ ENDIF
+
+ 300 CONTINUE
+ RETURN
+*
+* END OF CHB2ST_KERNELS
+*
+ END
diff --git a/SRC/chbev_2stage.f b/SRC/chbev_2stage.f
new file mode 100644
index 00000000..182d3d93
--- /dev/null
+++ b/SRC/chbev_2stage.f
@@ -0,0 +1,386 @@
+*> \brief <b> CHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from zhbev_2stage.f, fortran z -> c, Sat Nov 5 23:18:20 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+* REAL RWORK( * ), W( * )
+* COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (max(1,3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+ REAL RWORK( * ), W( * )
+ COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, WANTZ, LQUERY
+ INTEGER IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHB
+ EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR
+ $ CHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -11
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHBEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ W( 1 ) = REAL( AB( 1, 1 ) )
+ ELSE
+ W( 1 ) = REAL( AB( KD+1, 1 ) )
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = 1
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ RWORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, call CSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ INDRWK = INDE + N
+ CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of CHBEV_2STAGE
+*
+ END
diff --git a/SRC/chbevd_2stage.f b/SRC/chbevd_2stage.f
new file mode 100644
index 00000000..89c118d3
--- /dev/null
+++ b/SRC/chbevd_2stage.f
@@ -0,0 +1,458 @@
+*> \brief <b> CHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from zhbevd_2stage.f, fortran z -> c, Sat Nov 5 23:18:17 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, RWORK, LRWORK, IWORK,
+* LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* REAL RWORK( * ), W( * )
+* COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it
+*> uses a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array,
+*> dimension (LRWORK)
+*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The dimension of array RWORK.
+*> If N <= 1, LRWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*> If JOBZ = 'V' and N > 1, LRWORK must be at least
+*> 1 + 5*N + 2*N**2.
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of array IWORK.
+*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N .
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, RWORK, LRWORK, IWORK,
+ $ LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ REAL RWORK( * ), W( * )
+ COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+ COMPLEX CZERO, CONE
+ PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ),
+ $ CONE = ( 1.0E0, 0.0E0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE,
+ $ LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS,
+ $ LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHB
+ EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SSCAL, SSTERF, XERBLA, CGEMM, CLACPY,
+ $ CLASCL, CSTEDC, CHETRD_HB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ LRWMIN = 1
+ LIWMIN = 1
+ ELSE
+ IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LWMIN = 2*N**2
+ LRWMIN = 1 + 5*N + 2*N**2
+ LIWMIN = 3 + 5*N
+ ELSE
+ LWMIN = MAX( N, LHTRD + LWTRD )
+ LRWMIN = N
+ LIWMIN = 1
+ END IF
+ END IF
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHBEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = REAL( AB( 1, 1 ) )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDRWK = INDE + N
+ LLRWK = LRWORK - INDRWK + 1
+ INDHOUS = 1
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+ INDWK2 = INDWK + N*N
+ LLWK2 = LWORK - INDWK2 + 1
+*
+ CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ RWORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, call CSTEDC.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ),
+ $ LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK,
+ $ INFO )
+ CALL CGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO,
+ $ WORK( INDWK2 ), N )
+ CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+ RETURN
+*
+* End of CHBEVD_2STAGE
+*
+ END
diff --git a/SRC/chbevx_2stage.f b/SRC/chbevx_2stage.f
new file mode 100644
index 00000000..07eb6153
--- /dev/null
+++ b/SRC/chbevx_2stage.f
@@ -0,0 +1,646 @@
+*> \brief <b> CHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from zhbevx_2stage.f, fortran z -> c, Sat Nov 5 23:18:22 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+* Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+* Z, LDZ, WORK, LWORK, RWORK, IWORK,
+* IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* REAL RWORK( * ), W( * )
+* COMPLEX AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+* $ Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors
+*> can be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found;
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found;
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*> Q is COMPLEX array, dimension (LDQ, N)
+*> If JOBZ = 'V', the N-by-N unitary matrix used in the
+*> reduction to tridiagonal form.
+*> If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*> LDQ is INTEGER
+*> The leading dimension of the array Q. If JOBZ = 'V', then
+*> LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing AB to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*SLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+ $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+ $ Z, LDZ, WORK, LWORK, RWORK, IWORK,
+ $ IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ REAL RWORK( * ), W( * )
+ COMPLEX AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+ $ Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+ COMPLEX CZERO, CONE
+ PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ),
+ $ CONE = ( 1.0E0, 0.0E0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+ $ LQUERY
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ J, JJ, NSPLIT
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+ COMPLEX CTMP1
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHB
+ EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY,
+ $ CGEMV, CLACPY, CLASCL, CSTEIN, CSTEQR,
+ $ CSWAP, CHETRD_HB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -7
+ ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -11
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -12
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -13
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+ $ INFO = -18
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -20
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHBEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ M = 1
+ IF( LOWER ) THEN
+ CTMP1 = AB( 1, 1 )
+ ELSE
+ CTMP1 = AB( KD+1, 1 )
+ END IF
+ TMP1 = REAL( CTMP1 )
+ IF( VALEIG ) THEN
+ IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+ $ M = 0
+ END IF
+ IF( M.EQ.1 ) THEN
+ W( 1 ) = REAL( CTMP1 )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ ELSE
+ VLL = ZERO
+ VUU = ZERO
+ END IF
+ ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDRWK = INDE + N
+*
+ INDHOUS = 1
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL CHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB,
+ $ RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal
+* to zero, then call SSTERF or CSTEQR. If this fails for some
+* eigenvalue, then try SSTEBZ.
+*
+ TEST = .FALSE.
+ IF (INDEIG) THEN
+ IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+ CALL SCOPY( N, RWORK( INDD ), 1, W, 1 )
+ INDEE = INDRWK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL SSTERF( N, W, RWORK( INDEE ), INFO )
+ ELSE
+ CALL CLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+ CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 10 I = 1, N
+ IFAIL( I ) = 0
+ 10 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWK = INDISP + N
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWK ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by CSTEIN.
+*
+ DO 20 J = 1, M
+ CALL CCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+ CALL CGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO,
+ $ Z( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of CHBEVX_2STAGE
+*
+ END
diff --git a/SRC/cheev_2stage.f b/SRC/cheev_2stage.f
new file mode 100644
index 00000000..b98dac76
--- /dev/null
+++ b/SRC/cheev_2stage.f
@@ -0,0 +1,355 @@
+*> \brief <b> CHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @generated from zheev_2stage.f, fortran z -> c, Sat Nov 5 23:18:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+* REAL RWORK( * ), W( * )
+* COMPLEX A( LDA, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+ REAL RWORK( * ), W( * )
+ COMPLEX A( LDA, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+ COMPLEX CONE
+ PARAMETER ( CONE = ( 1.0E0, 0.0E0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHE
+ EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
+ $ CUNGTR, CHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -8
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHEEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = REAL( A( 1, 1 ) )
+ WORK( 1 ) = 1
+ IF( WANTZ )
+ $ A( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, first call
+* CUNGTR to generate the unitary matrix, then call CSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL CUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+ $ LLWORK, IINFO )
+ INDWRK = INDE + N
+ CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA,
+ $ RWORK( INDWRK ), INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal complex workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of CHEEV_2STAGE
+*
+ END
diff --git a/SRC/cheevd_2stage.f b/SRC/cheevd_2stage.f
new file mode 100644
index 00000000..9d1057fc
--- /dev/null
+++ b/SRC/cheevd_2stage.f
@@ -0,0 +1,451 @@
+*> \brief <b> CHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @generated from zheevd_2stage.f, fortran z -> c, Sat Nov 5 23:18:14 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* REAL RWORK( * ), W( * )
+* COMPLEX A( LDA, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If N <= 1, LWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N+1
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N+1
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array,
+*> dimension (LRWORK)
+*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The dimension of the array RWORK.
+*> If N <= 1, LRWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*> If JOBZ = 'V' and N > 1, LRWORK must be at least
+*> 1 + 5*N + 2*N**2.
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+*> to converge; i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> if INFO = i and JOBZ = 'V', then the algorithm failed
+*> to compute an eigenvalue while working on the submatrix
+*> lying in rows and columns INFO/(N+1) through
+*> mod(INFO,N+1).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> Modified description of INFO. Sven, 16 Feb 05.
+*
+*> \par Contributors:
+* ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ REAL RWORK( * ), W( * )
+ COMPLEX A( LDA, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+ COMPLEX CONE
+ PARAMETER ( CONE = ( 1.0E0, 0.0E0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2,
+ $ INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK,
+ $ LLWRK2, LRWMIN, LWMIN,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+
+
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHE
+ EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL SSCAL, SSTERF, XERBLA, CLACPY, CLASCL,
+ $ CSTEDC, CUNMTR, CHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ LRWMIN = 1
+ LIWMIN = 1
+ ELSE
+ KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LWMIN = 2*N + N*N
+ LRWMIN = 1 + 5*N + 2*N**2
+ LIWMIN = 3 + 5*N
+ ELSE
+ LWMIN = N + 1 + LHTRD + LWTRD
+ LRWMIN = N
+ LIWMIN = 1
+ END IF
+ END IF
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -8
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHEEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = REAL( A( 1, 1 ) )
+ IF( WANTZ )
+ $ A( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDE = 1
+ INDRWK = INDE + N
+ LLRWK = LRWORK - INDRWK + 1
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, first call
+* CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+* tridiagonal matrix, then call CUNMTR to multiply it to the
+* Householder transformations represented as Householder vectors in
+* A.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK,
+ $ IWORK, LIWORK, INFO )
+ CALL CUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+ $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+ CALL CLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of CHEEVD_2STAGE
+*
+ END
diff --git a/SRC/cheevr_2stage.f b/SRC/cheevr_2stage.f
new file mode 100644
index 00000000..23a98389
--- /dev/null
+++ b/SRC/cheevr_2stage.f
@@ -0,0 +1,779 @@
+*> \brief <b> CHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @generated from zheevr_2stage.f, fortran z -> c, Sat Nov 5 23:18:11 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+* WORK, LWORK, RWORK, LRWORK, IWORK,
+* LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+* $ M, N
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER ISUPPZ( * ), IWORK( * )
+* REAL RWORK( * ), W( * )
+* COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> CHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to CHETRD. Then, whenever possible, CHEEVR_2STAGE calls CSTEMR to compute
+*> eigenspectrum using Relatively Robust Representations. CSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*> (a) Compute T - sigma I = L D L^T, so that L and D
+*> define all the wanted eigenvalues to high relative accuracy.
+*> This means that small relative changes in the entries of D and L
+*> cause only small relative changes in the eigenvalues and
+*> eigenvectors. The standard (unfactored) representation of the
+*> tridiagonal matrix T does not have this property in general.
+*> (b) Compute the eigenvalues to suitable accuracy.
+*> If the eigenvectors are desired, the algorithm attains full
+*> accuracy of the computed eigenvalues only right before
+*> the corresponding vectors have to be computed, see steps c) and d).
+*> (c) For each cluster of close eigenvalues, select a new
+*> shift close to the cluster, find a new factorization, and refine
+*> the shifted eigenvalues to suitable accuracy.
+*> (d) For each eigenvalue with a large enough relative separation compute
+*> the corresponding eigenvector by forming a rank revealing twisted
+*> factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*> 2004. Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*> tridiagonal eigenvalue/eigenvector problem",
+*> Computer Science Division Technical Report No. UCB/CSD-97-971,
+*> UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : CHEEVR_2STAGE calls CSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> CHEEVR_2STAGE calls SSTEBZ and CSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of CSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and
+*> CSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*>
+*> If high relative accuracy is important, set ABSTOL to
+*> SLAMCH( 'Safe minimum' ). Doing so will guarantee that
+*> eigenvalues are computed to high relative accuracy when
+*> possible in future releases. The current code does not
+*> make any guarantees about high relative accuracy, but
+*> furutre releases will. See J. Barlow and J. Demmel,
+*> "Computing Accurate Eigensystems of Scaled Diagonally
+*> Dominant Matrices", LAPACK Working Note #7, for a discussion
+*> of which matrices define their eigenvalues to high relative
+*> accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*> The support of the eigenvectors in Z, i.e., the indices
+*> indicating the nonzero elements in Z. The i-th eigenvector
+*> is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal
+*> matrix). The support of the eigenvectors of A is typically
+*> 1:N because of the unitary transformations applied by CUNMTR.
+*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 26*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (MAX(1,LRWORK))
+*> On exit, if INFO = 0, RWORK(1) returns the optimal
+*> (and minimal) LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The length of the array RWORK. LRWORK >= max(1,24*N).
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal
+*> (and minimal) LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK. LIWORK >= max(1,10*N).
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: Internal error
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Inderjit Dhillon, IBM Almaden, USA \n
+*> Osni Marques, LBNL/NERSC, USA \n
+*> Ken Stanley, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*> Jason Riedy, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+ $ WORK, LWORK, RWORK, LRWORK, IWORK,
+ $ LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+ $ M, N
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER ISUPPZ( * ), IWORK( * )
+ REAL RWORK( * ), W( * )
+ COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE, TWO
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ, TRYRAC
+ CHARACTER ORDER
+ INTEGER I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP,
+ $ INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK,
+ $ INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ,
+ $ LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN,
+ $ LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANSY
+ EXTERNAL LSAME, ILAENV, SLAMCH, CLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
+ $ CHETRD_2STAGE, CSTEMR, CSTEIN, CSWAP, CUNMTR
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ IEEEOK = ILAENV( 10, 'CHEEVR', 'N', 1, 2, 3, 4 )
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+*
+ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
+ $ ( LIWORK.EQ.-1 ) )
+*
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ LRWMIN = MAX( 1, 24*N )
+ LIWMIN = MAX( 1, 10*N )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -18
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -20
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -22
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHEEVR_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ WORK( 1 ) = 2
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = REAL( A( 1, 1 ) )
+ ELSE
+ IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) )
+ $ THEN
+ M = 1
+ W( 1 ) = REAL( A( 1, 1 ) )
+ END IF
+ END IF
+ IF( WANTZ ) THEN
+ Z( 1, 1 ) = ONE
+ ISUPPZ( 1 ) = 1
+ ISUPPZ( 2 ) = 1
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF (VALEIG) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = CLANSY( 'M', UPLO, N, A, LDA, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL CSSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+
+* Initialize indices into workspaces. Note: The IWORK indices are
+* used only if SSTERF or CSTEMR fail.
+
+* WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the
+* elementary reflectors used in CHETRD.
+ INDTAU = 1
+* INDWK is the starting offset of the remaining complex workspace,
+* and LLWORK is the remaining complex workspace size.
+ INDHOUS = INDTAU + N
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+
+* RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal
+* entries.
+ INDRD = 1
+* RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the
+* tridiagonal matrix from CHETRD.
+ INDRE = INDRD + N
+* RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over
+* -written by CSTEMR (the SSTERF path copies the diagonal to W).
+ INDRDD = INDRE + N
+* RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over
+* -written while computing the eigenvalues in SSTERF and CSTEMR.
+ INDREE = INDRDD + N
+* INDRWK is the starting offset of the left-over real workspace, and
+* LLRWORK is the remaining workspace size.
+ INDRWK = INDREE + N
+ LLRWORK = LRWORK - INDRWK + 1
+
+* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and
+* stores the block indices of each of the M<=N eigenvalues.
+ INDIBL = 1
+* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and
+* stores the starting and finishing indices of each block.
+ INDISP = INDIBL + N
+* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+* that corresponding to eigenvectors that fail to converge in
+* CSTEIN. This information is discarded; if any fail, the driver
+* returns INFO > 0.
+ INDIFL = INDISP + N
+* INDIWO is the offset of the remaining integer workspace.
+ INDIWO = INDIFL + N
+
+*
+* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
+ $ RWORK( INDRE ), WORK( INDTAU ),
+ $ WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired
+* then call SSTERF or CSTEMR and CUNMTR.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N, RWORK( INDRD ), 1, W, 1 )
+ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+ CALL SSTERF( N, W, RWORK( INDREE ), INFO )
+ ELSE
+ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+ CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
+*
+ IF (ABSTOL .LE. TWO*N*EPS) THEN
+ TRYRAC = .TRUE.
+ ELSE
+ TRYRAC = .FALSE.
+ END IF
+ CALL CSTEMR( JOBZ, 'A', N, RWORK( INDRDD ),
+ $ RWORK( INDREE ), VL, VU, IL, IU, M, W,
+ $ Z, LDZ, N, ISUPPZ, TRYRAC,
+ $ RWORK( INDRWK ), LLRWORK,
+ $ IWORK, LIWORK, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by CSTEMR.
+*
+ IF( WANTZ .AND. INFO.EQ.0 ) THEN
+ INDWKN = INDWK
+ LLWRKN = LWORK - INDWKN + 1
+ CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA,
+ $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+ $ LLWRKN, IINFO )
+ END IF
+ END IF
+*
+*
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+* Also call SSTEBZ and CSTEIN if CSTEMR fails.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL CSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+ $ INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by CSTEIN.
+*
+ INDWKN = INDWK
+ LLWRKN = LWORK - INDWKN + 1
+ CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of CHEEVR_2STAGE
+*
+ END
diff --git a/SRC/cheevx_2stage.f b/SRC/cheevx_2stage.f
new file mode 100644
index 00000000..84ae438d
--- /dev/null
+++ b/SRC/cheevx_2stage.f
@@ -0,0 +1,618 @@
+*> \brief <b> CHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @generated from zheevx_2stage.f, fortran z -> c, Sat Nov 5 23:18:09 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+* LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* REAL RWORK( * ), W( * )
+* COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*SLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> On normal exit, the first M elements contain the selected
+*> eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 8*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+ $ LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ REAL RWORK( * ), W( * )
+ COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+ COMPLEX CONE
+ PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
+ $ ITMP1, J, JJ, LLWORK,
+ $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, CLANHE
+ EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
+ $ CLACPY, CSTEIN, CSTEQR, CSWAP, CUNGTR, CUNMTR,
+ $ CHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC REAL, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -17
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHEEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = REAL( A( 1, 1 ) )
+ ELSE IF( VALEIG ) THEN
+ IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) )
+ $ THEN
+ M = 1
+ W( 1 ) = REAL( A( 1, 1 ) )
+ END IF
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL CSSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDRWK = INDE + N
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
+ $ RWORK( INDE ), WORK( INDTAU ),
+ $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
+ $ LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal to
+* zero, then call SSTERF or CUNGTR and CSTEQR. If this fails for
+* some eigenvalue, then try SSTEBZ.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+ CALL SCOPY( N, RWORK( INDD ), 1, W, 1 )
+ INDEE = INDRWK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL SSTERF( N, W, RWORK( INDEE ), INFO )
+ ELSE
+ CALL CLACPY( 'A', N, N, A, LDA, Z, LDZ )
+ CALL CUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+ $ WORK( INDWRK ), LLWORK, IINFO )
+ CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 30 I = 1, N
+ IFAIL( I ) = 0
+ 30 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 40
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWK = INDISP + N
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWK ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by CSTEIN.
+*
+ CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWRK ), LLWORK, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 40 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 60 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 50 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 50 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 60 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal complex workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of CHEEVX_2STAGE
+*
+ END
diff --git a/SRC/chegv_2stage.f b/SRC/chegv_2stage.f
new file mode 100644
index 00000000..71d58d74
--- /dev/null
+++ b/SRC/chegv_2stage.f
@@ -0,0 +1,379 @@
+*> \brief \b CHEGV_2STAGE
+*
+* @generated from zhegv_2stage.f, fortran z -> c, Sun Nov 6 13:09:52 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+* WORK, LWORK, RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+* REAL RWORK( * ), W( * )
+* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a complex generalized Hermitian-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be Hermitian and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+* sizes N>2000.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*> ITYPE is INTEGER
+*> Specifies the problem type to be solved:
+*> = 1: A*x = (lambda)*B*x
+*> = 2: A*B*x = (lambda)*x
+*> = 3: B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangles of A and B are stored;
+*> = 'L': Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrices A and B. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*>
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> matrix Z of eigenvectors. The eigenvectors are normalized
+*> as follows:
+*> if ITYPE = 1 or 2, Z**H*B*Z = I;
+*> if ITYPE = 3, Z**H*inv(B)*Z = I.
+*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*> or the lower triangle (if UPLO='L') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is COMPLEX array, dimension (LDB, N)
+*> On entry, the Hermitian positive definite matrix B.
+*> If UPLO = 'U', the leading N-by-N upper triangular part of B
+*> contains the upper triangular part of the matrix B.
+*> If UPLO = 'L', the leading N-by-N lower triangular part of B
+*> contains the lower triangular part of the matrix B.
+*>
+*> On exit, if INFO <= N, the part of B containing the matrix is
+*> overwritten by the triangular factor U or L from the Cholesky
+*> factorization B = U**H*U or B = L*L**H.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B. LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is REAL array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: CPOTRF or CHEEV returned an error code:
+*> <= N: if INFO = i, CHEEV failed to converge;
+*> i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> > N: if INFO = N + i, for 1 <= i <= N, then the leading
+*> minor of order i of B is not positive definite.
+*> The factorization of B could not be completed and
+*> no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+ REAL RWORK( * ), W( * )
+ COMPLEX A( LDA, * ), B( LDB, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX ONE
+ PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTZ
+ CHARACTER TRANS
+ INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM,
+ $ CHEEV_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+ INFO = -8
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHEGV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+* Form a Cholesky factorization of B.
+*
+ CALL CPOTRF( UPLO, N, B, LDB, INFO )
+ IF( INFO.NE.0 ) THEN
+ INFO = N + INFO
+ RETURN
+ END IF
+*
+* Transform problem to standard eigenvalue problem and solve.
+*
+ CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+ CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IF( WANTZ ) THEN
+*
+* Backtransform eigenvectors to the original problem.
+*
+ NEIG = N
+ IF( INFO.GT.0 )
+ $ NEIG = INFO - 1
+ IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+* backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'N'
+ ELSE
+ TRANS = 'C'
+ END IF
+*
+ CALL CTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+*
+ ELSE IF( ITYPE.EQ.3 ) THEN
+*
+* For B*A*x=(lambda)*x;
+* backtransform eigenvectors: x = L*y or U**H *y
+*
+ IF( UPPER ) THEN
+ TRANS = 'C'
+ ELSE
+ TRANS = 'N'
+ END IF
+*
+ CALL CTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+ END IF
+ END IF
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of CHEGV_2STAGE
+*
+ END
diff --git a/SRC/chetrd_2stage.f b/SRC/chetrd_2stage.f
new file mode 100644
index 00000000..795462c6
--- /dev/null
+++ b/SRC/chetrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b CHETRD_2STAGE
+*
+* @generated from zhetrd_2stage.f, fortran z -> c, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+* HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER VECT, UPLO
+* INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+* REAL D( * ), E( * )
+* COMPLEX A( LDA, * ), TAU( * ),
+* HOUS2( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q1**H Q2**H* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> in particular for the second stage (Band to
+*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate Q1 Q2 or to apply Q1 Q2,
+*> then LHOUS2 is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA,N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the band superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> internal band-diagonal matrix AB, and the elements above
+*> the KD superdiagonal, with the array TAU, represent the unitary
+*> matrix Q1 as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and band subdiagonal of A are over-
+*> written by the corresponding elements of the internal band-diagonal
+*> matrix AB, and the elements below the KD subdiagonal, with
+*> the array TAU, represent the unitary matrix Q1 as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is REAL array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is REAL array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is COMPLEX array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors of
+*> the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*> HOUS2 is COMPLEX array, dimension LHOUS2, that
+*> store the Householder representation of the stage2
+*> band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*> LHOUS2 is INTEGER
+*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS2 array, returns
+*> this value as the first entry of the HOUS2 array, and no error
+*> message related to LHOUS2 is issued by XERBLA.
+*> LHOUS2 = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+ $ HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER VECT, UPLO
+ INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+ REAL D( * ), E( * )
+ COMPLEX A( LDA, * ), TAU( * ),
+ $ HOUS2( * ), WORK( * )
+* ..
+*
+* =====================================================================
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTQ
+ INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, CHETRD_HE2HB, CHETRD_HB2ST
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ INFO = 0
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ KD = ILAENV( 17, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+* $ LHMIN, LWMIN
+*
+ IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHETRD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDAB = KD+1
+ LWRK = LWORK-LDAB*N
+ ABPOS = 1
+ WPOS = ABPOS + LDAB*N
+ CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
+ $ TAU, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHETRD_HE2HB', -INFO )
+ RETURN
+ END IF
+ CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+ $ WORK( ABPOS ), LDAB, D, E,
+ $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHETRD_HB2ST', -INFO )
+ RETURN
+ END IF
+*
+*
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of CHETRD_2STAGE
+*
+ END
diff --git a/SRC/chetrd_hb2st.F b/SRC/chetrd_hb2st.F
new file mode 100644
index 00000000..6f253278
--- /dev/null
+++ b/SRC/chetrd_hb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b CHBTRD
+*
+* @generated from zhetrd_hb2st.F, fortran z -> c, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+* D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+* #define PRECISION_COMPLEX
+*
+* #if defined(_OPENMP)
+* use omp_lib
+* #endif
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER STAGE1, UPLO, VECT
+* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+* REAL D( * ), E( * )
+* COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHBTRD reduces a complex Hermitian band matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q**H * A * Q = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*> STAGE is CHARACTER*1
+*> = 'N': "No": to mention that the stage 1 of the reduction
+*> from dense to band using the chetrd_he2hb routine
+*> was not called before this routine to reproduce AB.
+*> In other term this routine is called as standalone.
+*> = 'Y': "Yes": to mention that the stage 1 of the
+*> reduction from dense to band using the chetrd_he2hb
+*> routine has been called to produce AB (e.g., AB is
+*> the output of chetrd_he2hb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> and thus LHOUS is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate or to apply Q later on,
+*> then LHOUS is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX array, dimension (LDAB,N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> On exit, the diagonal elements of AB are overwritten by the
+*> diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*> elements on the first superdiagonal (if UPLO = 'U') or the
+*> first subdiagonal (if UPLO = 'L') are overwritten by the
+*> off-diagonal elements of T; the rest of AB is overwritten by
+*> values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is REAL array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is REAL array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T:
+*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*> HOUS is COMPLEX array, dimension LHOUS, that
+*> store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*> LHOUS is INTEGER
+*> The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS array, returns
+*> this value as the first entry of the HOUS array, and no error
+*> message related to LHOUS is issued by XERBLA.
+*> LHOUS = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHERcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+ $ D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_COMPLEX
+*
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER STAGE1, UPLO, VECT
+ INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+ REAL D( * ), E( * )
+ COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL RZERO
+ COMPLEX ZERO, ONE
+ PARAMETER ( RZERO = 0.0E+0,
+ $ ZERO = ( 0.0E+0, 0.0E+0 ),
+ $ ONE = ( 1.0E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
+ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
+ $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+ $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+ $ NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+ $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
+ $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+ $ SICEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+ REAL ABSTMP
+ COMPLEX TMP
+#endif
+* ..
+* .. External Subroutines ..
+ EXTERNAL CHB2ST_KERNELS, CLACPY, CLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX, CEILING, REAL
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required.
+* Test the input parameters
+*
+ DEBUG = 0
+ INFO = 0
+ AFTERS1 = LSAME( STAGE1, 'Y' )
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+*
+ IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.(KD+1) ) THEN
+ INFO = -7
+ ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHETRD_HB2ST', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDV = KD + IB
+ SIZETAU = 2 * N
+ SICEV = 2 * N
+ INDTAU = 1
+ INDV = INDTAU + SIZETAU
+ LDA = 2 * KD + 1
+ SIZEA = LDA * N
+ INDA = 1
+ INDW = INDA + SIZEA
+ NTHREADS = 1
+ TID = 0
+*
+ IF( UPPER ) THEN
+ APOS = INDA + KD
+ AWPOS = INDA
+ DPOS = APOS + KD
+ OFDPOS = DPOS - 1
+ ABDPOS = KD + 1
+ ABOFDPOS = KD
+ ELSE
+ APOS = INDA
+ AWPOS = INDA + KD + 1
+ DPOS = APOS
+ OFDPOS = DPOS + 1
+ ABDPOS = 1
+ ABOFDPOS = 2
+
+ ENDIF
+*
+* Case KD=0:
+* The matrix is diagonal. We just copy it (convert to "real" for
+* complex because D is double and the imaginary part should be 0)
+* and store it in D. A sequential code here is better or
+* in a parallel environment it might need two cores for D and E
+*
+ IF( KD.EQ.0 ) THEN
+ DO 30 I = 1, N
+ D( I ) = REAL( AB( ABDPOS, I ) )
+ 30 CONTINUE
+ DO 40 I = 1, N-1
+ E( I ) = RZERO
+ 40 CONTINUE
+ GOTO 200
+ END IF
+*
+* Case KD=1:
+* The matrix is already Tridiagonal. We have to make diagonal
+* and offdiagonal elements real, and store them in D and E.
+* For that, for real precision just copy the diag and offdiag
+* to D and E while for the COMPLEX case the bulge chasing is
+* performed to convert the hermetian tridiagonal to symmetric
+* tridiagonal. A simpler coversion formula might be used, but then
+* updating the Q matrix will be required and based if Q is generated
+* or not this might complicate the story.
+*
+C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+ IF( KD.EQ.1 ) THEN
+ DO 50 I = 1, N
+ D( I ) = REAL( AB( ABDPOS, I ) )
+ 50 CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+* make off-diagonal elements real and copy them to E
+*
+ IF( UPPER ) THEN
+ DO 60 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I+1 )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I+1 ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C IF( WANTZ ) THEN
+C CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
+C END IF
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C IF( WANTQ ) THEN
+C CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C END IF
+ 70 CONTINUE
+ ENDIF
+#else
+ IF( UPPER ) THEN
+ DO 60 I = 1, N-1
+ E( I ) = REAL( AB( ABOFDPOS, I+1 ) )
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N-1
+ E( I ) = REAL( AB( ABOFDPOS, I ) )
+ 70 CONTINUE
+ ENDIF
+#endif
+ GOTO 200
+ END IF
+*
+* Main code start here.
+* Reduce the hermitian band of A to a tridiagonal matrix.
+*
+ THGRSIZ = N
+ GRSIZ = 1
+ SHIFT = 3
+ NBTILES = CEILING( REAL(N)/REAL(KD) )
+ STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+ THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*
+ CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+ CALL CLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+* openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
+!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+* main bulge chasing loop
+*
+ DO 100 THGRID = 1, THGRNB
+ STT = (THGRID-1)*THGRSIZ+1
+ THED = MIN( (STT + THGRSIZ -1), (N-1))
+ DO 110 I = STT, N-1
+ ED = MIN( I, THED )
+ IF( STT.GT.ED ) GOTO 100
+ DO 120 M = 1, STEPERCOL
+ ST = STT
+ DO 130 SWEEPID = ST, ED
+ DO 140 K = 1, GRSIZ
+ MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ)
+ $ + (M-1)*GRSIZ + K
+ IF ( MYID.EQ.1 ) THEN
+ TTYPE = 1
+ ELSE
+ TTYPE = MOD( MYID, 2 ) + 2
+ ENDIF
+
+ IF( TTYPE.EQ.2 ) THEN
+ COLPT = (MYID/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ BLKLASTIND = COLPT
+ ELSE
+ COLPT = ((MYID+1)/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ IF( ( STIND.GE.EDIND-1 ).AND.
+ $ ( EDIND.EQ.N ) ) THEN
+ BLKLASTIND = N
+ ELSE
+ BLKLASTIND = 0
+ ENDIF
+ ENDIF
+*
+* Call the kernel
+*
+#if defined(_OPENMP)
+ IF( TTYPE.NE.1 ) THEN
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(in:WORK(MYID-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ENDIF
+#else
+ CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+#endif
+ IF ( BLKLASTIND.GE.(N-1) ) THEN
+ STT = STT + 1
+ GOTO 130
+ ENDIF
+ 140 CONTINUE
+ 130 CONTINUE
+ 120 CONTINUE
+ 110 CONTINUE
+ 100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*
+* Copy the diagonal from A to D. Note that D is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ DO 150 I = 1, N
+ D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) )
+ 150 CONTINUE
+*
+* Copy the off diagonal from A to E. Note that E is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ IF( UPPER ) THEN
+ DO 160 I = 1, N-1
+ E( I ) = REAL( WORK( OFDPOS+I*LDA ) )
+ 160 CONTINUE
+ ELSE
+ DO 170 I = 1, N-1
+ E( I ) = REAL( WORK( OFDPOS+(I-1)*LDA ) )
+ 170 CONTINUE
+ ENDIF
+*
+ 200 CONTINUE
+*
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of CHETRD_HB2ST
+*
+ END
+#undef PRECISION_COMPLEX
+
diff --git a/SRC/chetrd_he2hb.f b/SRC/chetrd_he2hb.f
new file mode 100644
index 00000000..28f5dc60
--- /dev/null
+++ b/SRC/chetrd_he2hb.f
@@ -0,0 +1,517 @@
+*> \brief \b CHETRD_HE2HB
+*
+* @generated from zhetrd_he2hb.f, fortran z -> c, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHETRD_HE2HB + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+* COMPLEX A( LDA, * ), AB( LDAB, * ),
+* TAU( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian
+*> band-diagonal form AB by a unitary similarity transformation:
+*> Q**H * A * Q = AB.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension (LDA,N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> tridiagonal matrix T, and the elements above the first
+*> superdiagonal, with the array TAU, represent the unitary
+*> matrix Q as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and first subdiagonal of A are over-
+*> written by the corresponding elements of the tridiagonal
+*> matrix T, and the elements below the first subdiagonal, with
+*> the array TAU, represent the unitary matrix Q as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*> AB is COMPLEX array, dimension (LDAB,N)
+*> On exit, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is COMPLEX array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors (see Further
+*> Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension LWORK.
+*> On exit, if INFO = 0, or if LWORK=-1,
+*> WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK which should be calculated
+* by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*> where FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*> putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*> If UPLO = 'U', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**H
+*>
+*> where tau is a complex scalar, and v is a complex vector with
+*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*> A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*> If UPLO = 'L', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**H
+*>
+*> where tau is a complex scalar, and v is a complex vector with
+*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+* A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*> The contents of A on exit are illustrated by the following examples
+*> with n = 5:
+*>
+*> if UPLO = 'U': if UPLO = 'L':
+*>
+*> ( ab ab/v1 v1 v1 v1 ) ( ab )
+*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab )
+*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab )
+*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab )
+*> ( ab ) ( v1 v2 v3 ab/v4 ab )
+*>
+*> where d and e denote diagonal and off-diagonal elements of T, and vi
+*> denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+ COMPLEX A( LDA, * ), AB( LDAB, * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL RONE
+ COMPLEX ZERO, ONE, HALF
+ PARAMETER ( RONE = 1.0E+0,
+ $ ZERO = ( 0.0E+0, 0.0E+0 ),
+ $ ONE = ( 1.0E+0, 0.0E+0 ),
+ $ HALF = ( 0.5E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER
+ INTEGER I, J, IINFO, LWMIN, PN, PK, LK,
+ $ LDT, LDW, LDS2, LDS1,
+ $ LS2, LS1, LW, LT,
+ $ TPOS, WPOS, S2POS, S1POS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, CHER2K, CHEMM, CGEMM,
+ $ CLARFT, CGELQF, CGEQRF, CLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required
+* and test the input parameters
+*
+ INFO = 0
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+ LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
+
+ IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -2
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+ INFO = -7
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'CHETRD_HE2HB', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ WORK( 1 ) = LWMIN
+ RETURN
+ END IF
+*
+* Quick return if possible
+* Copy the upper/lower portion of A into AB
+*
+ IF( N.LE.KD+1 ) THEN
+ IF( UPPER ) THEN
+ DO 100 I = 1, N
+ LK = MIN( KD+1, I )
+ CALL CCOPY( LK, A( I-LK+1, I ), 1,
+ $ AB( KD+1-LK+1, I ), 1 )
+ 100 CONTINUE
+ ELSE
+ DO 110 I = 1, N
+ LK = MIN( KD+1, N-I+1 )
+ CALL CCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+ 110 CONTINUE
+ ENDIF
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine the pointer position for the workspace
+*
+ LDT = KD
+ LDS1 = KD
+ LT = LDT*KD
+ LW = N*KD
+ LS1 = LDS1*KD
+ LS2 = LWMIN - LT - LW - LS1
+* LS2 = N*MAX(KD,FACTOPTNB)
+ TPOS = 1
+ WPOS = TPOS + LT
+ S1POS = WPOS + LW
+ S2POS = S1POS + LS1
+ IF( UPPER ) THEN
+ LDW = KD
+ LDS2 = KD
+ ELSE
+ LDW = N
+ LDS2 = N
+ ENDIF
+*
+*
+* Set the workspace of the triangular matrix T to zero once such a
+* way everytime T is generated the upper/lower portion will be always zero
+*
+ CALL CLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+ IF( UPPER ) THEN
+ DO 10 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the LQ factorization of the current block
+*
+ CALL CGELQF( KD, PN, A( I, I+KD ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 20 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 20 CONTINUE
+*
+ CALL CLASET( 'Lower', PK, PK, ZERO, ONE,
+ $ A( I, I+KD ), LDA )
+*
+* Form the matrix T
+*
+ CALL CLARFT( 'Forward', 'Rowwise', PN, PK,
+ $ A( I, I+KD ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL CGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+ $ ONE, WORK( TPOS ), LDT,
+ $ A( I, I+KD ), LDA,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL CHEMM( 'Right', UPLO, PK, PN,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL CGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+ $ ONE, WORK( WPOS ), LDW,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL CGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+ $ -HALF, WORK( S1POS ), LDS1,
+ $ A( I, I+KD ), LDA,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V'*W - W'*V
+*
+ CALL CHER2K( UPLO, 'Conjugate', PN, PK,
+ $ -ONE, A( I, I+KD ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+ 10 CONTINUE
+*
+* Copy the upper band to AB which is the band storage matrix
+*
+ DO 30 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 30 CONTINUE
+*
+ ELSE
+*
+* Reduce the lower triangle of A to lower band matrix
+*
+ DO 40 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the QR factorization of the current block
+*
+ CALL CGEQRF( PN, KD, A( I+KD, I ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 50 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 50 CONTINUE
+*
+ CALL CLASET( 'Upper', PK, PK, ZERO, ONE,
+ $ A( I+KD, I ), LDA )
+*
+* Form the matrix T
+*
+ CALL CLARFT( 'Forward', 'Columnwise', PN, PK,
+ $ A( I+KD, I ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ ONE, A( I+KD, I ), LDA,
+ $ WORK( TPOS ), LDT,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL CHEMM( 'Left', UPLO, PN, PK,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL CGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+ $ ONE, WORK( S2POS ), LDS2,
+ $ WORK( WPOS ), LDW,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ -HALF, A( I+KD, I ), LDA,
+ $ WORK( S1POS ), LDS1,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V*W' - W*V'
+*
+ CALL CHER2K( UPLO, 'No transpose', PN, PK,
+ $ -ONE, A( I+KD, I ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+* ==================================================================
+* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+* DO 45 J = I, I+PK-1
+* LK = MIN( KD, N-J ) + 1
+* CALL CCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+* 45 CONTINUE
+* ==================================================================
+ 40 CONTINUE
+*
+* Copy the lower band to AB which is the band storage matrix
+*
+ DO 60 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 60 CONTINUE
+
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of CHETRD_HE2HB
+*
+ END
diff --git a/SRC/clarfy.f b/SRC/clarfy.f
new file mode 100644
index 00000000..572a4723
--- /dev/null
+++ b/SRC/clarfy.f
@@ -0,0 +1,163 @@
+*> \brief \b CLARFY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INCV, LDC, N
+* COMPLEX TAU
+* ..
+* .. Array Arguments ..
+* COMPLEX C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n Hermitian matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*> H = I - tau * v * v'
+*>
+*> where tau is a scalar and v is a vector.
+*>
+*> If tau is zero, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> Specifies whether the upper or lower triangular part of the
+*> Hermitian matrix C is stored.
+*> = 'U': Upper triangle
+*> = 'L': Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of rows and columns of the matrix C. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX array, dimension
+*> (1 + (N-1)*abs(INCV))
+*> The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*> INCV is INTEGER
+*> The increment between successive elements of v. INCV must
+*> not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*> TAU is COMPLEX
+*> The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX array, dimension (LDC, N)
+*> On entry, the matrix C.
+*> On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX array, dimension (N)
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup complex_eig
+*
+* =====================================================================
+ SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* -- LAPACK test routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2011
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INCV, LDC, N
+ COMPLEX TAU
+* ..
+* .. Array Arguments ..
+ COMPLEX C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX ONE, ZERO, HALF
+ PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ),
+ $ ZERO = ( 0.0E+0, 0.0E+0 ),
+ $ HALF = ( 0.5E+0, 0.0E+0 ) )
+* ..
+* .. Local Scalars ..
+ COMPLEX ALPHA
+* ..
+* .. External Subroutines ..
+ EXTERNAL CAXPY, CHEMV, CHER2
+* ..
+* .. External Functions ..
+ COMPLEX CDOTC
+ EXTERNAL CDOTC
+* ..
+* .. Executable Statements ..
+*
+ IF( TAU.EQ.ZERO )
+ $ RETURN
+*
+* Form w:= C * v
+*
+ CALL CHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+ ALPHA = -HALF*TAU*CDOTC( N, WORK, 1, V, INCV )
+ CALL CAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+* C := C - v * w' - w * v'
+*
+ CALL CHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+ RETURN
+*
+* End of CLARFY
+*
+ END
diff --git a/SRC/dlarfy.f b/SRC/dlarfy.f
new file mode 100644
index 00000000..089aa94e
--- /dev/null
+++ b/SRC/dlarfy.f
@@ -0,0 +1,161 @@
+*> \brief \b DLARFY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INCV, LDC, N
+* DOUBLE PRECISION TAU
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n symmetric matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*> H = I - tau * v * v'
+*>
+*> where tau is a scalar and v is a vector.
+*>
+*> If tau is zero, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> Specifies whether the upper or lower triangular part of the
+*> symmetric matrix C is stored.
+*> = 'U': Upper triangle
+*> = 'L': Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of rows and columns of the matrix C. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is DOUBLE PRECISION array, dimension
+*> (1 + (N-1)*abs(INCV))
+*> The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*> INCV is INTEGER
+*> The increment between successive elements of v. INCV must
+*> not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*> TAU is DOUBLE PRECISION
+*> The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is DOUBLE PRECISION array, dimension (LDC, N)
+*> On entry, the matrix C.
+*> On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (N)
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup double_eig
+*
+* =====================================================================
+ SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* -- LAPACK test routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2011
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INCV, LDC, N
+ DOUBLE PRECISION TAU
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ONE, ZERO, HALF
+ PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, HALF = 0.5D+0 )
+* ..
+* .. Local Scalars ..
+ DOUBLE PRECISION ALPHA
+* ..
+* .. External Subroutines ..
+ EXTERNAL DAXPY, DSYMV, DSYR2
+* ..
+* .. External Functions ..
+ DOUBLE PRECISION DDOT
+ EXTERNAL DDOT
+* ..
+* .. Executable Statements ..
+*
+ IF( TAU.EQ.ZERO )
+ $ RETURN
+*
+* Form w:= C * v
+*
+ CALL DSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+ ALPHA = -HALF*TAU*DDOT( N, WORK, 1, V, INCV )
+ CALL DAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+* C := C - v * w' - w * v'
+*
+ CALL DSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+ RETURN
+*
+* End of DLARFY
+*
+ END
diff --git a/SRC/dsb2st_kernels.f b/SRC/dsb2st_kernels.f
new file mode 100644
index 00000000..15d1186e
--- /dev/null
+++ b/SRC/dsb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b DSB2ST_KERNELS
+*
+* @generated from zhb2st_kernels.f, fortran z -> d, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsb2st_kernels.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+* ST, ED, SWEEP, N, NB, IB,
+* A, LDA, V, TAU, LDVT, WORK)
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* LOGICAL WANTZ
+* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A( LDA, * ), V( * ),
+* TAU( * ), WORK( * )
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSB2ST_KERNELS is an internal routine used by the DSYTRD_SB2ST
+*> subroutine.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> @param[in] n
+*> The order of the matrix A.
+*>
+*> @param[in] nb
+*> The size of the band.
+*>
+*> @param[in, out] A
+*> A pointer to the matrix A.
+*>
+*> @param[in] lda
+*> The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*> DOUBLE PRECISION array, dimension 2*n if eigenvalues only are
+*> requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*> DOUBLE PRECISION array, dimension (2*n).
+*> The scalar factors of the Householder reflectors are stored
+*> in this array.
+*>
+*> @param[in] st
+*> internal parameter for indices.
+*>
+*> @param[in] ed
+*> internal parameter for indices.
+*>
+*> @param[in] sweep
+*> internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*> internal parameter for indices.
+*>
+*> @param[in] wantz
+*> logical which indicate if Eigenvalue are requested or both
+*> Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*> Workspace of size nb.
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+ $ ST, ED, SWEEP, N, NB, IB,
+ $ A, LDA, V, TAU, LDVT, WORK)
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ LOGICAL WANTZ
+ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A( LDA, * ), V( * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D+0,
+ $ ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL UPPER
+ INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS,
+ $ DPOS, OFDPOS, AJETER
+ DOUBLE PRECISION CTMP
+* ..
+* .. External Subroutines ..
+ EXTERNAL DLARFG, DLARFX, DLARFY
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MOD
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* ..
+* .. Executable Statements ..
+*
+ AJETER = IB + LDVT
+ UPPER = LSAME( UPLO, 'U' )
+
+ IF( UPPER ) THEN
+ DPOS = 2 * NB + 1
+ OFDPOS = 2 * NB
+ ELSE
+ DPOS = 1
+ OFDPOS = 2
+ ENDIF
+
+*
+* Upper case
+*
+ IF( UPPER ) THEN
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 101, 102, 103 ) TTYPE
+*
+ 101 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 10 I = 1, LM-1
+ V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) )
+ A( OFDPOS-I, ST+I ) = ZERO
+ 10 CONTINUE
+ CTMP = ( A( OFDPOS, ST ) )
+ CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+ A( OFDPOS, ST ) = CTMP
+*
+ 103 CONTINUE
+ LM = ED - ST + 1
+ CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+ GOTO 300
+*
+ 102 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+ IF( LM.GT.0) THEN
+ CALL DLARFX( 'Left', LN, LM, V( VPOS ),
+ $ ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 30 I = 1, LM-1
+ V( VPOS+I ) = ( A( DPOS-NB-I, J1+I ) )
+ A( DPOS-NB-I, J1+I ) = ZERO
+ 30 CONTINUE
+ CTMP = ( A( DPOS-NB, J1 ) )
+ CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+ A( DPOS-NB, J1 ) = CTMP
+*
+ CALL DLARFX( 'Right', LN-1, LM, V( VPOS ),
+ $ TAU( TAUPOS ),
+ $ A( DPOS-NB+1, J1 ), LDA-1, WORK)
+ ENDIF
+ GOTO 300
+*
+* Lower case
+*
+ ELSE
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 201, 202, 203 ) TTYPE
+*
+ 201 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 20 I = 1, LM-1
+ V( VPOS+I ) = A( OFDPOS+I, ST-1 )
+ A( OFDPOS+I, ST-1 ) = ZERO
+ 20 CONTINUE
+ CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ 203 CONTINUE
+ LM = ED - ST + 1
+*
+ CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+
+ GOTO 300
+*
+ 202 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+*
+ IF( LM.GT.0) THEN
+ CALL DLARFX( 'Right', LM, LN, V( VPOS ),
+ $ TAU( TAUPOS ), A( DPOS+NB, ST ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 40 I = 1, LM-1
+ V( VPOS+I ) = A( DPOS+NB+I, ST )
+ A( DPOS+NB+I, ST ) = ZERO
+ 40 CONTINUE
+ CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ CALL DLARFX( 'Left', LM, LN-1, V( VPOS ),
+ $ ( TAU( TAUPOS ) ),
+ $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+ ENDIF
+ GOTO 300
+ ENDIF
+
+ 300 CONTINUE
+ RETURN
+*
+* END OF DSB2ST_KERNELS
+*
+ END
diff --git a/SRC/dsbev_2stage.f b/SRC/dsbev_2stage.f
new file mode 100644
index 00000000..771d29e0
--- /dev/null
+++ b/SRC/dsbev_2stage.f
@@ -0,0 +1,377 @@
+*> \brief <b> DSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, WANTZ, LQUERY
+ INTEGER IINFO, IMAX, INDE, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSB
+ EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DLASCL, DSCAL, DSTEQR, DSTERF, XERBLA
+ $ DSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -11
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSBEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ W( 1 ) = AB( 1, 1 )
+ ELSE
+ W( 1 ) = AB( KD+1, 1 )
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, call SSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+ CALL DSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ),
+ $ INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of DSBEV_2STAGE
+*
+ END
diff --git a/SRC/dsbevd_2stage.f b/SRC/dsbevd_2stage.f
new file mode 100644
index 00000000..39074681
--- /dev/null
+++ b/SRC/dsbevd_2stage.f
@@ -0,0 +1,412 @@
+*> \brief <b> DSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses
+*> a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK and IWORK
+*> arrays, returns these values as the first entries of the WORK
+*> and IWORK arrays, and no error message related to LWORK or
+*> LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 2, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK and IWORK arrays, and no error message related to
+*> LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ LLWRK2
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSB
+ EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DGEMM, DLACPY, DLASCL, DSCAL, DSTEDC,
+ $ DSTERF, XERBLA, DSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( N.LE.1 ) THEN
+ LIWMIN = 1
+ LWMIN = 1
+ ELSE
+ IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LIWMIN = 3 + 5*N
+ LWMIN = 1 + 5*N + 2*N**2
+ ELSE
+ LIWMIN = 1
+ LWMIN = MAX( 2*N, N+LHTRD+LWTRD )
+ END IF
+ END IF
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSBEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = AB( 1, 1 )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call DSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, call SSTEDC.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+ CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+ CALL DGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N,
+ $ ZERO, WORK( INDWK2 ), N )
+ CALL DLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 )
+ $ CALL DSCAL( N, ONE / SIGMA, W, 1 )
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+ RETURN
+*
+* End of DSBEVD_2STAGE
+*
+ END
diff --git a/SRC/dsbevx_2stage.f b/SRC/dsbevx_2stage.f
new file mode 100644
index 00000000..3cb3f661
--- /dev/null
+++ b/SRC/dsbevx_2stage.f
@@ -0,0 +1,633 @@
+*> \brief <b> DSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+* LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+* LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* DOUBLE PRECISION AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+* $ Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found;
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found;
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*> Q is DOUBLE PRECISION array, dimension (LDQ, N)
+*> If JOBZ = 'V', the N-by-N orthogonal matrix used in the
+*> reduction to tridiagonal form.
+*> If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*> LDQ is INTEGER
+*> The leading dimension of the array Q. If JOBZ = 'V', then
+*> LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing AB to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*DLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 7*N, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + 2*N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+ $ LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+ $ LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ DOUBLE PRECISION AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+ $ Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+ $ LQUERY
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ NSPLIT
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSB
+ EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DGEMV, DLACPY, DLASCL, DSCAL,
+ $ DSTEBZ, DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA,
+ $ DSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -7
+ ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -11
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -12
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -13
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+ $ INFO = -18
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -20
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSBEVX_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ M = 1
+ IF( LOWER ) THEN
+ TMP1 = AB( 1, 1 )
+ ELSE
+ TMP1 = AB( KD+1, 1 )
+ END IF
+ IF( VALEIG ) THEN
+ IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+ $ M = 0
+ END IF
+ IF( M.EQ.1 ) THEN
+ W( 1 ) = TMP1
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ ELSE
+ VLL = ZERO
+ VUU = ZERO
+ END IF
+ ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal
+* to zero, then call DSTERF or SSTEQR. If this fails for some
+* eigenvalue, then try DSTEBZ.
+*
+ TEST = .FALSE.
+ IF (INDEIG) THEN
+ IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+ CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+ INDEE = INDWRK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL DLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+ $ WORK( INDWRK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 10 I = 1, N
+ IFAIL( I ) = 0
+ 10 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWO = INDISP + N
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by DSTEIN.
+*
+ DO 20 J = 1, M
+ CALL DCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+ CALL DGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO,
+ $ Z( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of DSBEVX_2STAGE
+*
+ END
diff --git a/SRC/dsyev_2stage.f b/SRC/dsyev_2stage.f
new file mode 100644
index 00000000..a42e86d8
--- /dev/null
+++ b/SRC/dsyev_2stage.f
@@ -0,0 +1,348 @@
+*> \brief <b> DSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSY
+ EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL DLASCL, DORGTR, DSCAL, DSTEQR, DSTERF,
+ $ XERBLA, DSYTRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -8
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = A( 1, 1 )
+ WORK( 1 ) = 2
+ IF( WANTZ )
+ $ A( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = INDE + N
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, first call
+* DORGTR to generate the orthogonal matrix, then call DSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+* Not available in this release, and agrument checking should not
+* let it getting here
+ RETURN
+ CALL DORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+ $ LLWORK, IINFO )
+ CALL DSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ),
+ $ INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of DSYEV_2STAGE
+*
+ END
diff --git a/SRC/dsyevd_2stage.f b/SRC/dsyevd_2stage.f
new file mode 100644
index 00000000..161f0e9e
--- /dev/null
+++ b/SRC/dsyevd_2stage.f
@@ -0,0 +1,406 @@
+*> \brief <b> DSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array,
+*> dimension (LWORK)
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If N <= 1, LWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N+1
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be at least
+*> 1 + 6*N + 2*N**2.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK and IWORK
+*> arrays, returns these values as the first entries of the WORK
+*> and IWORK arrays, and no error message related to LWORK or
+*> LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK and IWORK arrays, and no error message related to
+*> LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+*> to converge; i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> if INFO = i and JOBZ = 'V', then the algorithm failed
+*> to compute an eigenvalue while working on the submatrix
+*> lying in rows and columns INFO/(N+1) through
+*> mod(INFO,N+1).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date September 2012
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA \n
+*> Modified by Francoise Tisseur, University of Tennessee \n
+*> Modified description of INFO. Sven, 16 Feb 05. \n
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.2) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* September 2012
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+*
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE,
+ $ LIWMIN, LLWORK, LLWRK2, LWMIN,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSY
+ EXTERNAL LSAME, DLAMCH, DLANSY, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DLACPY, DLASCL, DORMTR, DSCAL, DSTEDC, DSTERF,
+ $ DSYTRD_2STAGE, XERBLA
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LIWMIN = 1
+ LWMIN = 1
+ ELSE
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LIWMIN = 3 + 5*N
+ LWMIN = 1 + 6*N + 2*N**2
+ ELSE
+ LIWMIN = 1
+ LWMIN = 2*N + 1 + LHTRD + LWTRD
+ END IF
+ END IF
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -8
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = A( 1, 1 )
+ IF( WANTZ )
+ $ A( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = INDE + N
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, first call
+* DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+* tridiagonal matrix, then call DORMTR to multiply it by the
+* Householder transformations stored in A.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+* Not available in this release, and agrument checking should not
+* let it getting here
+ RETURN
+ CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+ CALL DORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+ $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+ CALL DLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 )
+ $ CALL DSCAL( N, ONE / SIGMA, W, 1 )
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of DSYEVD_2STAGE
+*
+ END
diff --git a/SRC/dsyevr_2stage.f b/SRC/dsyevr_2stage.f
new file mode 100644
index 00000000..c1b468dc
--- /dev/null
+++ b/SRC/dsyevr_2stage.f
@@ -0,0 +1,740 @@
+*> \brief <b> DSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+* LWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER ISUPPZ( * ), IWORK( * )
+* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> DSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to DSYTRD. Then, whenever possible, DSYEVR_2STAGE calls DSTEMR to compute
+*> the eigenspectrum using Relatively Robust Representations. DSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*> (a) Compute T - sigma I = L D L^T, so that L and D
+*> define all the wanted eigenvalues to high relative accuracy.
+*> This means that small relative changes in the entries of D and L
+*> cause only small relative changes in the eigenvalues and
+*> eigenvectors. The standard (unfactored) representation of the
+*> tridiagonal matrix T does not have this property in general.
+*> (b) Compute the eigenvalues to suitable accuracy.
+*> If the eigenvectors are desired, the algorithm attains full
+*> accuracy of the computed eigenvalues only right before
+*> the corresponding vectors have to be computed, see steps c) and d).
+*> (c) For each cluster of close eigenvalues, select a new
+*> shift close to the cluster, find a new factorization, and refine
+*> the shifted eigenvalues to suitable accuracy.
+*> (d) For each eigenvalue with a large enough relative separation compute
+*> the corresponding eigenvector by forming a rank revealing twisted
+*> factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*> 2004. Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*> tridiagonal eigenvalue/eigenvector problem",
+*> Computer Science Division Technical Report No. UCB/CSD-97-971,
+*> UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : DSYEVR_2STAGE calls DSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> DSYEVR_2STAGE calls DSTEBZ and SSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of DSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and
+*> DSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*>
+*> If high relative accuracy is important, set ABSTOL to
+*> DLAMCH( 'Safe minimum' ). Doing so will guarantee that
+*> eigenvalues are computed to high relative accuracy when
+*> possible in future releases. The current code does not
+*> make any guarantees about high relative accuracy, but
+*> future releases will. See J. Barlow and J. Demmel,
+*> "Computing Accurate Eigensystems of Scaled Diagonally
+*> Dominant Matrices", LAPACK Working Note #7, for a discussion
+*> of which matrices define their eigenvalues to high relative
+*> accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> Supplying N columns is always safe.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*> The support of the eigenvectors in Z, i.e., the indices
+*> indicating the nonzero elements in Z. The i-th eigenvector
+*> is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal
+*> matrix). The support of the eigenvectors of A is typically
+*> 1:N because of the orthogonal transformations applied by DORMTR.
+*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 26*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 5*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 5*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK. LIWORK >= max(1,10*N).
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal size of the IWORK array,
+*> returns this value as the first entry of the IWORK array, and
+*> no error message related to LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: Internal error
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Inderjit Dhillon, IBM Almaden, USA \n
+*> Osni Marques, LBNL/NERSC, USA \n
+*> Ken Stanley, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*> Jason Riedy, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+ $ LWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER ISUPPZ( * ), IWORK( * )
+ DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE, TWO
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ,
+ $ TRYRAC
+ CHARACTER ORDER
+ INTEGER I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE,
+ $ INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU,
+ $ INDWK, INDWKN, ISCALE, J, JJ, LIWMIN,
+ $ LLWORK, LLWRKN, LWMIN, NSPLIT,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSY
+ EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DORMTR, DSCAL, DSTEBZ, DSTEMR, DSTEIN,
+ $ DSTERF, DSWAP, DSYTRD_2STAGE, XERBLA
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ IEEEOK = ILAENV( 10, 'DSYEVR', 'N', 1, 2, 3, 4 )
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+*
+ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
+*
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD )
+ LIWMIN = MAX( 1, 10*N )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -18
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -20
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+* NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
+* NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
+* LWKOPT = MAX( ( NB+1 )*N, LWMIN )
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYEVR_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ WORK( 1 ) = 7
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ ELSE
+ IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ END IF
+ END IF
+ IF( WANTZ ) THEN
+ Z( 1, 1 ) = ONE
+ ISUPPZ( 1 ) = 1
+ ISUPPZ( 2 ) = 1
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF (VALEIG) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL DSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+
+* Initialize indices into workspaces. Note: The IWORK indices are
+* used only if DSTERF or DSTEMR fail.
+
+* WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the
+* elementary reflectors used in DSYTRD.
+ INDTAU = 1
+* WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries.
+ INDD = INDTAU + N
+* WORK(INDE:INDE+N-1) stores the off-diagonal entries of the
+* tridiagonal matrix from DSYTRD.
+ INDE = INDD + N
+* WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over
+* -written by DSTEMR (the DSTERF path copies the diagonal to W).
+ INDDD = INDE + N
+* WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over
+* -written while computing the eigenvalues in DSTERF and DSTEMR.
+ INDEE = INDDD + N
+* INDHOUS is the starting offset Householder storage of stage 2
+ INDHOUS = INDEE + N
+* INDWK is the starting offset of the left-over workspace, and
+* LLWORK is the remaining workspace size.
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+
+
+* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and
+* stores the block indices of each of the M<=N eigenvalues.
+ INDIBL = 1
+* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and
+* stores the starting and finishing indices of each block.
+ INDISP = INDIBL + N
+* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+* that corresponding to eigenvectors that fail to converge in
+* DSTEIN. This information is discarded; if any fail, the driver
+* returns INFO > 0.
+ INDIFL = INDISP + N
+* INDIWO is the offset of the remaining integer workspace.
+ INDIWO = INDIFL + N
+
+*
+* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+*
+ CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired
+* then call DSTERF or DSTEMR and DORMTR.
+*
+ IF( ( ALLEIG .OR. ( INDEIG .AND. IL.EQ.1 .AND. IU.EQ.N ) ) .AND.
+ $ IEEEOK.EQ.1 ) THEN
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 )
+*
+ IF (ABSTOL .LE. TWO*N*EPS) THEN
+ TRYRAC = .TRUE.
+ ELSE
+ TRYRAC = .FALSE.
+ END IF
+ CALL DSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ),
+ $ VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ,
+ $ TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK,
+ $ INFO )
+*
+*
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by DSTEMR.
+*
+ IF( WANTZ .AND. INFO.EQ.0 ) THEN
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA,
+ $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+ $ LLWRKN, IINFO )
+ END IF
+ END IF
+*
+*
+ IF( INFO.EQ.0 ) THEN
+* Everything worked. Skip DSTEBZ/DSTEIN. IWORK(:) are
+* undefined.
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, DSTEIN.
+* Also call DSTEBZ and DSTEIN if DSTEMR fails.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+ $ INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by DSTEIN.
+*
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+* Jump here if DSTEMR/DSTEIN succeeded.
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors. Note: We do not sort the IFAIL portion of IWORK.
+* It may not be initialized (if DSTEMR/DSTEIN succeeded), and we do
+* not return this detailed information to the user.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ W( I ) = W( J )
+ W( J ) = TMP1
+ CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of DSYEVR_2STAGE
+*
+ END
diff --git a/SRC/dsyevx_2stage.f b/SRC/dsyevx_2stage.f
new file mode 100644
index 00000000..2c52e9e3
--- /dev/null
+++ b/SRC/dsyevx_2stage.f
@@ -0,0 +1,608 @@
+*> \brief <b> DSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+* LWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of indices
+*> for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*DLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> On normal exit, the first M elements contain the selected
+*> eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 8*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 3*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 3*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+ $ LWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE,
+ $ ITMP1, J, JJ, LLWORK, LLWRKN,
+ $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, DLANSY
+ EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DLACPY, DORGTR, DORMTR, DSCAL, DSTEBZ,
+ $ DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA,
+ $ DSYTRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD )
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -17
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ ELSE
+ IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ END IF
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL DSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDTAU = 1
+ INDE = INDTAU + N
+ INDD = INDE + N
+ INDHOUS = INDD + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal to
+* zero, then call DSTERF or DORGTR and SSTEQR. If this fails for
+* some eigenvalue, then try DSTEBZ.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+ CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+ INDEE = INDWRK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL DLACPY( 'A', N, N, A, LDA, Z, LDZ )
+ CALL DORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+ $ WORK( INDWRK ), LLWORK, IINFO )
+ CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+ $ WORK( INDWRK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 30 I = 1, N
+ IFAIL( I ) = 0
+ 30 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 40
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWO = INDISP + N
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by DSTEIN.
+*
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 40 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 60 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 50 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 50 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 60 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of DSYEVX_2STAGE
+*
+ END
diff --git a/SRC/dsygv_2stage.f b/SRC/dsygv_2stage.f
new file mode 100644
index 00000000..2c79ec8a
--- /dev/null
+++ b/SRC/dsygv_2stage.f
@@ -0,0 +1,370 @@
+*> \brief \b DSYGV_2STAGE
+*
+* @precisions fortran d -> s
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a real generalized symmetric-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be symmetric and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+* sizes N>2000.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*> ITYPE is INTEGER
+*> Specifies the problem type to be solved:
+*> = 1: A*x = (lambda)*B*x
+*> = 2: A*B*x = (lambda)*x
+*> = 3: B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangles of A and B are stored;
+*> = 'L': Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrices A and B. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*>
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> matrix Z of eigenvectors. The eigenvectors are normalized
+*> as follows:
+*> if ITYPE = 1 or 2, Z**T*B*Z = I;
+*> if ITYPE = 3, Z**T*inv(B)*Z = I.
+*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*> or the lower triangle (if UPLO='L') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is DOUBLE PRECISION array, dimension (LDB, N)
+*> On entry, the symmetric positive definite matrix B.
+*> If UPLO = 'U', the leading N-by-N upper triangular part of B
+*> contains the upper triangular part of the matrix B.
+*> If UPLO = 'L', the leading N-by-N lower triangular part of B
+*> contains the lower triangular part of the matrix B.
+*>
+*> On exit, if INFO <= N, the part of B containing the matrix is
+*> overwritten by the triangular factor U or L from the Cholesky
+*> factorization B = U**T*U or B = L*L**T.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B. LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: DPOTRF or DSYEV returned an error code:
+*> <= N: if INFO = i, DSYEV failed to converge;
+*> i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> > N: if INFO = N + i, for 1 <= i <= N, then the leading
+*> minor of order i of B is not positive definite.
+*> The factorization of B could not be completed and
+*> no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ONE
+ PARAMETER ( ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTZ
+ CHARACTER TRANS
+ INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DPOTRF, DSYGST, DTRMM, DTRSM, XERBLA,
+ $ DSYEV_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+ INFO = -8
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYGV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+* Form a Cholesky factorization of B.
+*
+ CALL DPOTRF( UPLO, N, B, LDB, INFO )
+ IF( INFO.NE.0 ) THEN
+ INFO = N + INFO
+ RETURN
+ END IF
+*
+* Transform problem to standard eigenvalue problem and solve.
+*
+ CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+ CALL DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
+*
+ IF( WANTZ ) THEN
+*
+* Backtransform eigenvectors to the original problem.
+*
+ NEIG = N
+ IF( INFO.GT.0 )
+ $ NEIG = INFO - 1
+ IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+* backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'N'
+ ELSE
+ TRANS = 'T'
+ END IF
+*
+ CALL DTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+*
+ ELSE IF( ITYPE.EQ.3 ) THEN
+*
+* For B*A*x=(lambda)*x;
+* backtransform eigenvectors: x = L*y or U**T*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'T'
+ ELSE
+ TRANS = 'N'
+ END IF
+*
+ CALL DTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+ END IF
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of DSYGV_2STAGE
+*
+ END
diff --git a/SRC/dsytrd_2stage.f b/SRC/dsytrd_2stage.f
new file mode 100644
index 00000000..449a279e
--- /dev/null
+++ b/SRC/dsytrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b DSYTRD_2STAGE
+*
+* @generated from zhetrd_2stage.f, fortran z -> d, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+* HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER VECT, UPLO
+* INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION D( * ), E( * )
+* DOUBLE PRECISION A( LDA, * ), TAU( * ),
+* HOUS2( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q1**T Q2**T* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> in particular for the second stage (Band to
+*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate Q1 Q2 or to apply Q1 Q2,
+*> then LHOUS2 is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA,N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the band superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> internal band-diagonal matrix AB, and the elements above
+*> the KD superdiagonal, with the array TAU, represent the orthogonal
+*> matrix Q1 as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and band subdiagonal of A are over-
+*> written by the corresponding elements of the internal band-diagonal
+*> matrix AB, and the elements below the KD subdiagonal, with
+*> the array TAU, represent the orthogonal matrix Q1 as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is DOUBLE PRECISION array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is DOUBLE PRECISION array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is DOUBLE PRECISION array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors of
+*> the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*> HOUS2 is DOUBLE PRECISION array, dimension LHOUS2, that
+*> store the Householder representation of the stage2
+*> band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*> LHOUS2 is INTEGER
+*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS2 array, returns
+*> this value as the first entry of the HOUS2 array, and no error
+*> message related to LHOUS2 is issued by XERBLA.
+*> LHOUS2 = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+ $ HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER VECT, UPLO
+ INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION D( * ), E( * )
+ DOUBLE PRECISION A( LDA, * ), TAU( * ),
+ $ HOUS2( * ), WORK( * )
+* ..
+*
+* =====================================================================
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTQ
+ INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, DSYTRD_SY2SB, DSYTRD_SB2ST
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ INFO = 0
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+* $ LHMIN, LWMIN
+*
+ IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYTRD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDAB = KD+1
+ LWRK = LWORK-LDAB*N
+ ABPOS = 1
+ WPOS = ABPOS + LDAB*N
+ CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
+ $ TAU, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
+ RETURN
+ END IF
+ CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+ $ WORK( ABPOS ), LDAB, D, E,
+ $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
+ RETURN
+ END IF
+*
+*
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of DSYTRD_2STAGE
+*
+ END
diff --git a/SRC/dsytrd_sb2st.F b/SRC/dsytrd_sb2st.F
new file mode 100644
index 00000000..d50debe1
--- /dev/null
+++ b/SRC/dsytrd_sb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b DSBTRD
+*
+* @generated from zhetrd_hb2st.F, fortran z -> d, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+* D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+* #define PRECISION_REAL
+*
+* #if defined(_OPENMP)
+* use omp_lib
+* #endif
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER STAGE1, UPLO, VECT
+* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION D( * ), E( * )
+* DOUBLE PRECISION AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSBTRD reduces a real symmetric band matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q**T * A * Q = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*> STAGE is CHARACTER*1
+*> = 'N': "No": to mention that the stage 1 of the reduction
+*> from dense to band using the dsytrd_sy2sb routine
+*> was not called before this routine to reproduce AB.
+*> In other term this routine is called as standalone.
+*> = 'Y': "Yes": to mention that the stage 1 of the
+*> reduction from dense to band using the dsytrd_sy2sb
+*> routine has been called to produce AB (e.g., AB is
+*> the output of dsytrd_sy2sb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> and thus LHOUS is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate or to apply Q later on,
+*> then LHOUS is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> On exit, the diagonal elements of AB are overwritten by the
+*> diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*> elements on the first superdiagonal (if UPLO = 'U') or the
+*> first subdiagonal (if UPLO = 'L') are overwritten by the
+*> off-diagonal elements of T; the rest of AB is overwritten by
+*> values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is DOUBLE PRECISION array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is DOUBLE PRECISION array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T:
+*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that
+*> store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*> LHOUS is INTEGER
+*> The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS array, returns
+*> this value as the first entry of the HOUS array, and no error
+*> message related to LHOUS is issued by XERBLA.
+*> LHOUS = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup real16OTHERcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+ $ D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_REAL
+*
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER STAGE1, UPLO, VECT
+ INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION D( * ), E( * )
+ DOUBLE PRECISION AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION RZERO
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( RZERO = 0.0D+0,
+ $ ZERO = 0.0D+0,
+ $ ONE = 1.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
+ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
+ $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+ $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+ $ NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+ $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
+ $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+ $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+ DOUBLE PRECISION ABSTMP
+ DOUBLE PRECISION TMP
+#endif
+* ..
+* .. External Subroutines ..
+ EXTERNAL DSB2ST_KERNELS, DLACPY, DLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX, CEILING, REAL
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required.
+* Test the input parameters
+*
+ DEBUG = 0
+ INFO = 0
+ AFTERS1 = LSAME( STAGE1, 'Y' )
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+*
+ IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.(KD+1) ) THEN
+ INFO = -7
+ ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDV = KD + IB
+ SIZETAU = 2 * N
+ SIDEV = 2 * N
+ INDTAU = 1
+ INDV = INDTAU + SIZETAU
+ LDA = 2 * KD + 1
+ SIZEA = LDA * N
+ INDA = 1
+ INDW = INDA + SIZEA
+ NTHREADS = 1
+ TID = 0
+*
+ IF( UPPER ) THEN
+ APOS = INDA + KD
+ AWPOS = INDA
+ DPOS = APOS + KD
+ OFDPOS = DPOS - 1
+ ABDPOS = KD + 1
+ ABOFDPOS = KD
+ ELSE
+ APOS = INDA
+ AWPOS = INDA + KD + 1
+ DPOS = APOS
+ OFDPOS = DPOS + 1
+ ABDPOS = 1
+ ABOFDPOS = 2
+
+ ENDIF
+*
+* Case KD=0:
+* The matrix is diagonal. We just copy it (convert to "real" for
+* real because D is double and the imaginary part should be 0)
+* and store it in D. A sequential code here is better or
+* in a parallel environment it might need two cores for D and E
+*
+ IF( KD.EQ.0 ) THEN
+ DO 30 I = 1, N
+ D( I ) = ( AB( ABDPOS, I ) )
+ 30 CONTINUE
+ DO 40 I = 1, N-1
+ E( I ) = RZERO
+ 40 CONTINUE
+ GOTO 200
+ END IF
+*
+* Case KD=1:
+* The matrix is already Tridiagonal. We have to make diagonal
+* and offdiagonal elements real, and store them in D and E.
+* For that, for real precision just copy the diag and offdiag
+* to D and E while for the COMPLEX case the bulge chasing is
+* performed to convert the hermetian tridiagonal to symmetric
+* tridiagonal. A simpler coversion formula might be used, but then
+* updating the Q matrix will be required and based if Q is generated
+* or not this might complicate the story.
+*
+C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+ IF( KD.EQ.1 ) THEN
+ DO 50 I = 1, N
+ D( I ) = ( AB( ABDPOS, I ) )
+ 50 CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+* make off-diagonal elements real and copy them to E
+*
+ IF( UPPER ) THEN
+ DO 60 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I+1 )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I+1 ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C IF( WANTZ ) THEN
+C CALL DSCAL( N, ( TMP ), Q( 1, I+1 ), 1 )
+C END IF
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C IF( WANTQ ) THEN
+C CALL DSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C END IF
+ 70 CONTINUE
+ ENDIF
+#else
+ IF( UPPER ) THEN
+ DO 60 I = 1, N-1
+ E( I ) = ( AB( ABOFDPOS, I+1 ) )
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N-1
+ E( I ) = ( AB( ABOFDPOS, I ) )
+ 70 CONTINUE
+ ENDIF
+#endif
+ GOTO 200
+ END IF
+*
+* Main code start here.
+* Reduce the symmetric band of A to a tridiagonal matrix.
+*
+ THGRSIZ = N
+ GRSIZ = 1
+ SHIFT = 3
+ NBTILES = CEILING( REAL(N)/REAL(KD) )
+ STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+ THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*
+ CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+ CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+* openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
+!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+* main bulge chasing loop
+*
+ DO 100 THGRID = 1, THGRNB
+ STT = (THGRID-1)*THGRSIZ+1
+ THED = MIN( (STT + THGRSIZ -1), (N-1))
+ DO 110 I = STT, N-1
+ ED = MIN( I, THED )
+ IF( STT.GT.ED ) GOTO 100
+ DO 120 M = 1, STEPERCOL
+ ST = STT
+ DO 130 SWEEPID = ST, ED
+ DO 140 K = 1, GRSIZ
+ MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ)
+ $ + (M-1)*GRSIZ + K
+ IF ( MYID.EQ.1 ) THEN
+ TTYPE = 1
+ ELSE
+ TTYPE = MOD( MYID, 2 ) + 2
+ ENDIF
+
+ IF( TTYPE.EQ.2 ) THEN
+ COLPT = (MYID/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ BLKLASTIND = COLPT
+ ELSE
+ COLPT = ((MYID+1)/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ IF( ( STIND.GE.EDIND-1 ).AND.
+ $ ( EDIND.EQ.N ) ) THEN
+ BLKLASTIND = N
+ ELSE
+ BLKLASTIND = 0
+ ENDIF
+ ENDIF
+*
+* Call the kernel
+*
+#if defined(_OPENMP)
+ IF( TTYPE.NE.1 ) THEN
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(in:WORK(MYID-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ENDIF
+#else
+ CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+#endif
+ IF ( BLKLASTIND.GE.(N-1) ) THEN
+ STT = STT + 1
+ GOTO 130
+ ENDIF
+ 140 CONTINUE
+ 130 CONTINUE
+ 120 CONTINUE
+ 110 CONTINUE
+ 100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*
+* Copy the diagonal from A to D. Note that D is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ DO 150 I = 1, N
+ D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
+ 150 CONTINUE
+*
+* Copy the off diagonal from A to E. Note that E is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ IF( UPPER ) THEN
+ DO 160 I = 1, N-1
+ E( I ) = ( WORK( OFDPOS+I*LDA ) )
+ 160 CONTINUE
+ ELSE
+ DO 170 I = 1, N-1
+ E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
+ 170 CONTINUE
+ ENDIF
+*
+ 200 CONTINUE
+*
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of DSYTRD_SB2ST
+*
+ END
+#undef PRECISION_REAL
+
diff --git a/SRC/dsytrd_sy2sb.f b/SRC/dsytrd_sy2sb.f
new file mode 100644
index 00000000..8f0261df
--- /dev/null
+++ b/SRC/dsytrd_sy2sb.f
@@ -0,0 +1,517 @@
+*> \brief \b DSYTRD_SY2SB
+*
+* @generated from zhetrd_he2hb.f, fortran z -> d, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYTRD_SY2SB + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A( LDA, * ), AB( LDAB, * ),
+* TAU( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric
+*> band-diagonal form AB by a orthogonal similarity transformation:
+*> Q**T * A * Q = AB.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA,N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> tridiagonal matrix T, and the elements above the first
+*> superdiagonal, with the array TAU, represent the orthogonal
+*> matrix Q as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and first subdiagonal of A are over-
+*> written by the corresponding elements of the tridiagonal
+*> matrix T, and the elements below the first subdiagonal, with
+*> the array TAU, represent the orthogonal matrix Q as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*> AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*> On exit, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is DOUBLE PRECISION array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors (see Further
+*> Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension LWORK.
+*> On exit, if INFO = 0, or if LWORK=-1,
+*> WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK which should be calculated
+* by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*> where FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*> putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*> If UPLO = 'U', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**T
+*>
+*> where tau is a real scalar, and v is a real vector with
+*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*> A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*> If UPLO = 'L', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**T
+*>
+*> where tau is a real scalar, and v is a real vector with
+*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+* A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*> The contents of A on exit are illustrated by the following examples
+*> with n = 5:
+*>
+*> if UPLO = 'U': if UPLO = 'L':
+*>
+*> ( ab ab/v1 v1 v1 v1 ) ( ab )
+*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab )
+*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab )
+*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab )
+*> ( ab ) ( v1 v2 v3 ab/v4 ab )
+*>
+*> where d and e denote diagonal and off-diagonal elements of T, and vi
+*> denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A( LDA, * ), AB( LDAB, * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION RONE
+ DOUBLE PRECISION ZERO, ONE, HALF
+ PARAMETER ( RONE = 1.0D+0,
+ $ ZERO = 0.0D+0,
+ $ ONE = 1.0D+0,
+ $ HALF = 0.5D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER
+ INTEGER I, J, IINFO, LWMIN, PN, PK, LK,
+ $ LDT, LDW, LDS2, LDS1,
+ $ LS2, LS1, LW, LT,
+ $ TPOS, WPOS, S2POS, S1POS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, DSYR2K, DSYMM, DGEMM,
+ $ DLARFT, DGELQF, DGEQRF, DLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required
+* and test the input parameters
+*
+ INFO = 0
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+ LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
+
+ IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -2
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+ INFO = -7
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ WORK( 1 ) = LWMIN
+ RETURN
+ END IF
+*
+* Quick return if possible
+* Copy the upper/lower portion of A into AB
+*
+ IF( N.LE.KD+1 ) THEN
+ IF( UPPER ) THEN
+ DO 100 I = 1, N
+ LK = MIN( KD+1, I )
+ CALL DCOPY( LK, A( I-LK+1, I ), 1,
+ $ AB( KD+1-LK+1, I ), 1 )
+ 100 CONTINUE
+ ELSE
+ DO 110 I = 1, N
+ LK = MIN( KD+1, N-I+1 )
+ CALL DCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+ 110 CONTINUE
+ ENDIF
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine the pointer position for the workspace
+*
+ LDT = KD
+ LDS1 = KD
+ LT = LDT*KD
+ LW = N*KD
+ LS1 = LDS1*KD
+ LS2 = LWMIN - LT - LW - LS1
+* LS2 = N*MAX(KD,FACTOPTNB)
+ TPOS = 1
+ WPOS = TPOS + LT
+ S1POS = WPOS + LW
+ S2POS = S1POS + LS1
+ IF( UPPER ) THEN
+ LDW = KD
+ LDS2 = KD
+ ELSE
+ LDW = N
+ LDS2 = N
+ ENDIF
+*
+*
+* Set the workspace of the triangular matrix T to zero once such a
+* way everytime T is generated the upper/lower portion will be always zero
+*
+ CALL DLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+ IF( UPPER ) THEN
+ DO 10 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the LQ factorization of the current block
+*
+ CALL DGELQF( KD, PN, A( I, I+KD ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 20 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 20 CONTINUE
+*
+ CALL DLASET( 'Lower', PK, PK, ZERO, ONE,
+ $ A( I, I+KD ), LDA )
+*
+* Form the matrix T
+*
+ CALL DLARFT( 'Forward', 'Rowwise', PN, PK,
+ $ A( I, I+KD ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL DGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+ $ ONE, WORK( TPOS ), LDT,
+ $ A( I, I+KD ), LDA,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL DSYMM( 'Right', UPLO, PK, PN,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL DGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+ $ ONE, WORK( WPOS ), LDW,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL DGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+ $ -HALF, WORK( S1POS ), LDS1,
+ $ A( I, I+KD ), LDA,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V'*W - W'*V
+*
+ CALL DSYR2K( UPLO, 'Conjugate', PN, PK,
+ $ -ONE, A( I, I+KD ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+ 10 CONTINUE
+*
+* Copy the upper band to AB which is the band storage matrix
+*
+ DO 30 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 30 CONTINUE
+*
+ ELSE
+*
+* Reduce the lower triangle of A to lower band matrix
+*
+ DO 40 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the QR factorization of the current block
+*
+ CALL DGEQRF( PN, KD, A( I+KD, I ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 50 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 50 CONTINUE
+*
+ CALL DLASET( 'Upper', PK, PK, ZERO, ONE,
+ $ A( I+KD, I ), LDA )
+*
+* Form the matrix T
+*
+ CALL DLARFT( 'Forward', 'Columnwise', PN, PK,
+ $ A( I+KD, I ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ ONE, A( I+KD, I ), LDA,
+ $ WORK( TPOS ), LDT,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL DSYMM( 'Left', UPLO, PN, PK,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL DGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+ $ ONE, WORK( S2POS ), LDS2,
+ $ WORK( WPOS ), LDW,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ -HALF, A( I+KD, I ), LDA,
+ $ WORK( S1POS ), LDS1,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V*W' - W*V'
+*
+ CALL DSYR2K( UPLO, 'No transpose', PN, PK,
+ $ -ONE, A( I+KD, I ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+* ==================================================================
+* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+* DO 45 J = I, I+PK-1
+* LK = MIN( KD, N-J ) + 1
+* CALL DCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+* 45 CONTINUE
+* ==================================================================
+ 40 CONTINUE
+*
+* Copy the lower band to AB which is the band storage matrix
+*
+ DO 60 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 60 CONTINUE
+
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of DSYTRD_SY2SB
+*
+ END
diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f
index 42a380cf..c66f1679 100644
--- a/SRC/ilaenv.f
+++ b/SRC/ilaenv.f
@@ -189,7 +189,8 @@
* .. Executable Statements ..
*
GO TO ( 10, 10, 10, 80, 90, 100, 110, 120,
- $ 130, 140, 150, 160, 160, 160, 160, 160 )ISPEC
+ $ 130, 140, 150, 160, 160, 160, 160, 160,
+ $ 170, 170, 170, 170, 170 )ISPEC
*
* Invalid value for ISPEC
*
@@ -690,6 +691,13 @@
ILAENV = IPARMQ( ISPEC, NAME, OPTS, N1, N2, N3, N4 )
RETURN
*
+ 170 CONTINUE
+*
+* 17 <= ISPEC <= 21: 2stage eigenvalues and SVD or related subroutines.
+*
+ ILAENV = IPARAM2STAGE( ISPEC, NAME, OPTS, N1, N2, N3, N4 )
+ RETURN
+*
* End of ILAENV
*
END
diff --git a/SRC/iparam2stage.F b/SRC/iparam2stage.F
new file mode 100644
index 00000000..6443f16e
--- /dev/null
+++ b/SRC/iparam2stage.F
@@ -0,0 +1,388 @@
+*> \brief \b IPARAM2STAGE
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download IPARAM2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/iparam2stage.F">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/iparam2stage.F">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/iparam2stage.F">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS,
+* NI, NBI, IBI, NXI )
+* #if defined(_OPENMP)
+* use omp_lib
+* #endif
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER*( * ) NAME, OPTS
+* INTEGER ISPEC, NI, NBI, IBI, NXI
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> This program sets problem and machine dependent parameters
+*> useful for xHETRD_2STAGE, xHETRD_H@2HB, xHETRD_HB2ST,
+*> xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD
+*> and related subroutines for eigenvalue problems.
+*> It is called whenever ILAENV is called with 17 <= ISPEC <= 21
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] ISPEC
+*> \verbatim
+*> ISPEC is integer scalar
+*> ISPEC specifies which tunable parameter IPARAM2STAGE should
+*> return.
+*>
+*> ISPEC=17: the optimal blocksize nb for the reduction to
+* BAND
+*>
+*> ISPEC=18: the optimal blocksize ib for the eigenvectors
+*> singular vectors update routine
+*>
+*> ISPEC=19: The length of the array that store the Housholder
+*> representation for the second stage
+*> Band to Tridiagonal or Bidiagonal
+*>
+*> ISPEC=20: The workspace needed for the routine in input.
+*>
+*> ISPEC=21: For future release.
+*> \endverbatim
+*>
+*> \param[in] NAME
+*> \verbatim
+*> NAME is character string
+*> Name of the calling subroutine
+*> \endverbatim
+*>
+*> \param[in] OPTS
+*> \verbatim
+*> OPTS is CHARACTER*(*)
+*> The character options to the subroutine NAME, concatenated
+*> into a single character string. For example, UPLO = 'U',
+*> TRANS = 'T', and DIAG = 'N' for a triangular routine would
+*> be specified as OPTS = 'UTN'.
+*> \endverbatim
+*>
+*> \param[in] NI
+*> \verbatim
+*> NI is INTEGER which is the size of the matrix
+*> \endverbatim
+*>
+*> \param[in] NBI
+*> \verbatim
+*> NBI is INTEGER which is the used in the reduciton,
+* (e.g., the size of the band), needed to compute workspace
+* and LHOUS2.
+*> \endverbatim
+*>
+*> \param[in] IBI
+*> \verbatim
+*> IBI is INTEGER which represent the IB of the reduciton,
+* needed to compute workspace and LHOUS2.
+*> \endverbatim
+*>
+*> \param[in] NXI
+*> \verbatim
+*> NXI is INTEGER needed in the future release.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup auxOTHERauxiliary
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All detail are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS,
+ $ NI, NBI, IBI, NXI )
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+ IMPLICIT NONE
+*
+* -- LAPACK auxiliary routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER*( * ) NAME, OPTS
+ INTEGER ISPEC, NI, NBI, IBI, NXI
+*
+* ================================================================
+* ..
+* .. Local Scalars ..
+ INTEGER I, IC, IZ, KD, IB, LHOUS, LWORK, NTHREADS,
+ $ FACTOPTNB, QROPTNB, LQOPTNB
+ LOGICAL RPREC, CPREC
+ CHARACTER PREC*1, ALGO*3, STAG*5, SUBNAM*12, VECT*3
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CHAR, ICHAR, MAX
+* ..
+* .. External Functions ..
+ INTEGER ILAENV
+ EXTERNAL ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Invalid value for ISPEC
+*
+ IF( (ISPEC.LT.17).OR.(ISPEC.GT.21) ) THEN
+ IPARAM2STAGE = -1
+ RETURN
+ ENDIF
+*
+* Get the number of threads
+*
+ NTHREADS = 1
+#if defined(_OPENMP)
+!$OMP PARALLEL
+ NTHREADS = OMP_GET_NUM_THREADS()
+!$OMP END PARALLEL
+#endif
+* WRITE(*,*) 'IPARAM VOICI NTHREADS ISPEC ',NTHREADS, ISPEC
+ IF( ISPEC.EQ.19 ) GOTO 19
+*
+* Convert NAME to upper case if the first character is lower case.
+*
+ IPARAM2STAGE = -1
+ SUBNAM = NAME
+ IC = ICHAR( SUBNAM( 1: 1 ) )
+ IZ = ICHAR( 'Z' )
+ IF( IZ.EQ.90 .OR. IZ.EQ.122 ) THEN
+*
+* ASCII character set
+*
+ IF( IC.GE.97 .AND. IC.LE.122 ) THEN
+ SUBNAM( 1: 1 ) = CHAR( IC-32 )
+ DO 100 I = 2, 12
+ IC = ICHAR( SUBNAM( I: I ) )
+ IF( IC.GE.97 .AND. IC.LE.122 )
+ $ SUBNAM( I: I ) = CHAR( IC-32 )
+ 100 CONTINUE
+ END IF
+*
+ ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN
+*
+* EBCDIC character set
+*
+ IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR.
+ $ ( IC.GE.145 .AND. IC.LE.153 ) .OR.
+ $ ( IC.GE.162 .AND. IC.LE.169 ) ) THEN
+ SUBNAM( 1: 1 ) = CHAR( IC+64 )
+ DO 110 I = 2, 12
+ IC = ICHAR( SUBNAM( I: I ) )
+ IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR.
+ $ ( IC.GE.145 .AND. IC.LE.153 ) .OR.
+ $ ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I:
+ $ I ) = CHAR( IC+64 )
+ 110 CONTINUE
+ END IF
+*
+ ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN
+*
+* Prime machines: ASCII+128
+*
+ IF( IC.GE.225 .AND. IC.LE.250 ) THEN
+ SUBNAM( 1: 1 ) = CHAR( IC-32 )
+ DO 120 I = 2, 12
+ IC = ICHAR( SUBNAM( I: I ) )
+ IF( IC.GE.225 .AND. IC.LE.250 )
+ $ SUBNAM( I: I ) = CHAR( IC-32 )
+ 120 CONTINUE
+ END IF
+ END IF
+*
+ PREC = SUBNAM( 1: 1 )
+ ALGO = SUBNAM( 4: 6 )
+ STAG = SUBNAM( 8:12 )
+ RPREC = PREC.EQ.'S' .OR. PREC.EQ.'D'
+ CPREC = PREC.EQ.'C' .OR. PREC.EQ.'Z'
+*
+* Invalid value for PRECISION
+*
+ IF( .NOT.( RPREC .OR. CPREC ) ) THEN
+ IPARAM2STAGE = -1
+ RETURN
+ ENDIF
+* WRITE(*,*),'RPREC,CPREC ',RPREC,CPREC,
+* $ ' ALGO ',ALGO,' STAGE ',STAG
+*
+ GO TO ( 17, 17, 19, 20, 21 ) ISPEC-16
+*
+ 17 CONTINUE
+*
+* ISPEC = 17, 18: block size KD, IB
+* Could be also dependent from N but for now it
+* depend only on sequential or parallel
+*
+ IF( NTHREADS.GT.4 ) THEN
+ IF( CPREC ) THEN
+ KD = 128
+ IB = 32
+ ELSE
+ KD = 160
+ IB = 40
+ ENDIF
+ ELSE IF( NTHREADS.GT.1 ) THEN
+ IF( CPREC ) THEN
+ KD = 64
+ IB = 32
+ ELSE
+ KD = 64
+ IB = 32
+ ENDIF
+ ELSE
+ IF( CPREC ) THEN
+ KD = 16
+ IB = 16
+ ELSE
+ KD = 32
+ IB = 16
+ ENDIF
+ ENDIF
+ IF( ISPEC.EQ.17 ) IPARAM2STAGE = KD
+ IF( ISPEC.EQ.18 ) IPARAM2STAGE = IB
+ RETURN
+*
+ 19 CONTINUE
+*
+* ISPEC = 19:
+* LHOUS length of the Houselholder representation
+* matrix (V,T) of the second stage. should be >= 1.
+*
+* Will add the VECT OPTION HERE next release
+ VECT = OPTS(1:1)
+ IF( VECT.EQ.'N' ) THEN
+ LHOUS = MAX( 1, 4*NI )
+ ELSE
+* This is not correct, it need to call the ALGO and the stage2
+ LHOUS = MAX( 1, 4*NI ) + IBI
+ ENDIF
+ IF( LHOUS.GE.0 ) THEN
+ IPARAM2STAGE = LHOUS
+ ELSE
+ IPARAM2STAGE = -1
+ ENDIF
+ RETURN
+*
+ 20 CONTINUE
+*
+* ISPEC = 20: (21 for future use)
+* LWORK length of the workspace for
+* either or both stages for TRD and BRD. should be >= 1.
+* TRD:
+* TRD_stage 1: = LT + LW + LS1 + LS2
+* = LDT*KD + N*KD + N*MAX(KD,FACTOPTNB) + LDS2*KD
+* where LDT=LDS2=KD
+* = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+* TRD_stage 2: = (2NB+1)*N + KD*NTHREADS
+* TRD_both : = max(stage1,stage2) + AB ( AB=(KD+1)*N )
+* = N*KD + N*max(KD+1,FACTOPTNB)
+* + max(2*KD*KD, KD*NTHREADS)
+* + (KD+1)*N
+ LWORK = -1
+ SUBNAM(1:1) = PREC
+ SUBNAM(2:6) = 'GEQRF'
+ QROPTNB = ILAENV( 1, SUBNAM, ' ', NI, NBI, -1, -1 )
+ SUBNAM(2:6) = 'GELQF'
+ LQOPTNB = ILAENV( 1, SUBNAM, ' ', NBI, NI, -1, -1 )
+* Could be QR or LQ for TRD and the max for BRD
+ FACTOPTNB = MAX(QROPTNB, LQOPTNB)
+ IF( ALGO.EQ.'TRD' ) THEN
+ IF( STAG.EQ.'2STAG' ) THEN
+ LWORK = NI*NBI + NI*MAX(NBI+1,FACTOPTNB)
+ $ + MAX(2*NBI*NBI, NBI*NTHREADS)
+ $ + (NBI+1)*NI
+ ELSE IF( (STAG.EQ.'HE2HB').OR.(STAG.EQ.'SY2SB') ) THEN
+ LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI
+ ELSE IF( (STAG.EQ.'HB2ST').OR.(STAG.EQ.'SB2ST') ) THEN
+ LWORK = (2*NBI+1)*NI + NBI*NTHREADS
+ ENDIF
+ ELSE IF( ALGO.EQ.'BRD' ) THEN
+ IF( STAG.EQ.'2STAG' ) THEN
+ LWORK = 2*NI*NBI + NI*MAX(NBI+1,FACTOPTNB)
+ $ + MAX(2*NBI*NBI, NBI*NTHREADS)
+ $ + (NBI+1)*NI
+ ELSE IF( STAG.EQ.'GE2GB' ) THEN
+ LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI
+ ELSE IF( STAG.EQ.'GB2BD' ) THEN
+ LWORK = (3*NBI+1)*NI + NBI*NTHREADS
+ ENDIF
+ ENDIF
+ LWORK = MAX ( 1, LWORK )
+
+ IF( LWORK.GT.0 ) THEN
+ IPARAM2STAGE = LWORK
+ ELSE
+ IPARAM2STAGE = -1
+ ENDIF
+ RETURN
+*
+ 21 CONTINUE
+*
+* ISPEC = 21 for future use
+ IPARAM2STAGE = NXI
+ RETURN
+*
+* ==== End of IPARAM2STAGE ====
+*
+ END
diff --git a/SRC/slarfy.f b/SRC/slarfy.f
new file mode 100644
index 00000000..19a7fa6d
--- /dev/null
+++ b/SRC/slarfy.f
@@ -0,0 +1,161 @@
+*> \brief \b SLARFY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INCV, LDC, N
+* REAL TAU
+* ..
+* .. Array Arguments ..
+* REAL C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n symmetric matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*> H = I - tau * v * v'
+*>
+*> where tau is a scalar and v is a vector.
+*>
+*> If tau is zero, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> Specifies whether the upper or lower triangular part of the
+*> symmetric matrix C is stored.
+*> = 'U': Upper triangle
+*> = 'L': Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of rows and columns of the matrix C. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is REAL array, dimension
+*> (1 + (N-1)*abs(INCV))
+*> The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*> INCV is INTEGER
+*> The increment between successive elements of v. INCV must
+*> not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*> TAU is REAL
+*> The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is REAL array, dimension (LDC, N)
+*> On entry, the matrix C.
+*> On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (N)
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup single_eig
+*
+* =====================================================================
+ SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* -- LAPACK test routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2011
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INCV, LDC, N
+ REAL TAU
+* ..
+* .. Array Arguments ..
+ REAL C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ONE, ZERO, HALF
+ PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, HALF = 0.5E+0 )
+* ..
+* .. Local Scalars ..
+ REAL ALPHA
+* ..
+* .. External Subroutines ..
+ EXTERNAL SAXPY, SSYMV, SSYR2
+* ..
+* .. External Functions ..
+ REAL SDOT
+ EXTERNAL SDOT
+* ..
+* .. Executable Statements ..
+*
+ IF( TAU.EQ.ZERO )
+ $ RETURN
+*
+* Form w:= C * v
+*
+ CALL SSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+ ALPHA = -HALF*TAU*SDOT( N, WORK, 1, V, INCV )
+ CALL SAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+* C := C - v * w' - w * v'
+*
+ CALL SSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+ RETURN
+*
+* End of SLARFY
+*
+ END
diff --git a/SRC/ssb2st_kernels.f b/SRC/ssb2st_kernels.f
new file mode 100644
index 00000000..60058dda
--- /dev/null
+++ b/SRC/ssb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b SSB2ST_KERNELS
+*
+* @generated from zhb2st_kernels.f, fortran z -> s, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssb2st_kernels.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+* ST, ED, SWEEP, N, NB, IB,
+* A, LDA, V, TAU, LDVT, WORK)
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* LOGICAL WANTZ
+* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), V( * ),
+* TAU( * ), WORK( * )
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSB2ST_KERNELS is an internal routine used by the SSYTRD_SB2ST
+*> subroutine.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> @param[in] n
+*> The order of the matrix A.
+*>
+*> @param[in] nb
+*> The size of the band.
+*>
+*> @param[in, out] A
+*> A pointer to the matrix A.
+*>
+*> @param[in] lda
+*> The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*> REAL array, dimension 2*n if eigenvalues only are
+*> requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*> REAL array, dimension (2*n).
+*> The scalar factors of the Householder reflectors are stored
+*> in this array.
+*>
+*> @param[in] st
+*> internal parameter for indices.
+*>
+*> @param[in] ed
+*> internal parameter for indices.
+*>
+*> @param[in] sweep
+*> internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*> internal parameter for indices.
+*>
+*> @param[in] wantz
+*> logical which indicate if Eigenvalue are requested or both
+*> Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*> Workspace of size nb.
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+ $ ST, ED, SWEEP, N, NB, IB,
+ $ A, LDA, V, TAU, LDVT, WORK)
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ LOGICAL WANTZ
+ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), V( * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E+0,
+ $ ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL UPPER
+ INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS,
+ $ DPOS, OFDPOS, AJETER
+ REAL CTMP
+* ..
+* .. External Subroutines ..
+ EXTERNAL SLARFG, SLARFX, SLARFY
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MOD
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* ..
+* .. Executable Statements ..
+*
+ AJETER = IB + LDVT
+ UPPER = LSAME( UPLO, 'U' )
+
+ IF( UPPER ) THEN
+ DPOS = 2 * NB + 1
+ OFDPOS = 2 * NB
+ ELSE
+ DPOS = 1
+ OFDPOS = 2
+ ENDIF
+
+*
+* Upper case
+*
+ IF( UPPER ) THEN
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 101, 102, 103 ) TTYPE
+*
+ 101 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 10 I = 1, LM-1
+ V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) )
+ A( OFDPOS-I, ST+I ) = ZERO
+ 10 CONTINUE
+ CTMP = ( A( OFDPOS, ST ) )
+ CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+ A( OFDPOS, ST ) = CTMP
+*
+ 103 CONTINUE
+ LM = ED - ST + 1
+ CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+ GOTO 300
+*
+ 102 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+ IF( LM.GT.0) THEN
+ CALL SLARFX( 'Left', LN, LM, V( VPOS ),
+ $ ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 30 I = 1, LM-1
+ V( VPOS+I ) = ( A( DPOS-NB-I, J1+I ) )
+ A( DPOS-NB-I, J1+I ) = ZERO
+ 30 CONTINUE
+ CTMP = ( A( DPOS-NB, J1 ) )
+ CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+ A( DPOS-NB, J1 ) = CTMP
+*
+ CALL SLARFX( 'Right', LN-1, LM, V( VPOS ),
+ $ TAU( TAUPOS ),
+ $ A( DPOS-NB+1, J1 ), LDA-1, WORK)
+ ENDIF
+ GOTO 300
+*
+* Lower case
+*
+ ELSE
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 201, 202, 203 ) TTYPE
+*
+ 201 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 20 I = 1, LM-1
+ V( VPOS+I ) = A( OFDPOS+I, ST-1 )
+ A( OFDPOS+I, ST-1 ) = ZERO
+ 20 CONTINUE
+ CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ 203 CONTINUE
+ LM = ED - ST + 1
+*
+ CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+
+ GOTO 300
+*
+ 202 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+*
+ IF( LM.GT.0) THEN
+ CALL SLARFX( 'Right', LM, LN, V( VPOS ),
+ $ TAU( TAUPOS ), A( DPOS+NB, ST ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 40 I = 1, LM-1
+ V( VPOS+I ) = A( DPOS+NB+I, ST )
+ A( DPOS+NB+I, ST ) = ZERO
+ 40 CONTINUE
+ CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ CALL SLARFX( 'Left', LM, LN-1, V( VPOS ),
+ $ ( TAU( TAUPOS ) ),
+ $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+ ENDIF
+ GOTO 300
+ ENDIF
+
+ 300 CONTINUE
+ RETURN
+*
+* END OF SSB2ST_KERNELS
+*
+ END
diff --git a/SRC/ssbev_2stage.f b/SRC/ssbev_2stage.f
new file mode 100644
index 00000000..821c00a3
--- /dev/null
+++ b/SRC/ssbev_2stage.f
@@ -0,0 +1,377 @@
+*> \brief <b> SSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from dsbev_2stage.f, fortran d -> s, Sat Nov 5 23:58:09 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+* REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is REAL array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is REAL array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+ REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, WANTZ, LQUERY
+ INTEGER IINFO, IMAX, INDE, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSB
+ EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA
+ $ SSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -11
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSBEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ W( 1 ) = AB( 1, 1 )
+ ELSE
+ W( 1 ) = AB( KD+1, 1 )
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, call SSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+ CALL SSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ),
+ $ INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of SSBEV_2STAGE
+*
+ END
diff --git a/SRC/ssbevd_2stage.f b/SRC/ssbevd_2stage.f
new file mode 100644
index 00000000..8a306306
--- /dev/null
+++ b/SRC/ssbevd_2stage.f
@@ -0,0 +1,412 @@
+*> \brief <b> SSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from dsbevd_2stage.f, fortran d -> s, Sat Nov 5 23:58:03 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses
+*> a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is REAL array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is REAL array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK and IWORK
+*> arrays, returns these values as the first entries of the WORK
+*> and IWORK arrays, and no error message related to LWORK or
+*> LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 2, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK and IWORK arrays, and no error message related to
+*> LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ LLWRK2
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSB
+ EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC,
+ $ SSTERF, XERBLA, SSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( N.LE.1 ) THEN
+ LIWMIN = 1
+ LWMIN = 1
+ ELSE
+ IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LIWMIN = 3 + 5*N
+ LWMIN = 1 + 5*N + 2*N**2
+ ELSE
+ LIWMIN = 1
+ LWMIN = MAX( 2*N, N+LHTRD+LWTRD )
+ END IF
+ END IF
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSBEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = AB( 1, 1 )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call SSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, call SSTEDC.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+ CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+ CALL SGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N,
+ $ ZERO, WORK( INDWK2 ), N )
+ CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 )
+ $ CALL SSCAL( N, ONE / SIGMA, W, 1 )
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+ RETURN
+*
+* End of SSBEVD_2STAGE
+*
+ END
diff --git a/SRC/ssbevx_2stage.f b/SRC/ssbevx_2stage.f
new file mode 100644
index 00000000..d3a588c4
--- /dev/null
+++ b/SRC/ssbevx_2stage.f
@@ -0,0 +1,633 @@
+*> \brief <b> SSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @generated from dsbevx_2stage.f, fortran d -> s, Sat Nov 5 23:58:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+* LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+* LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* REAL AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+* $ Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found;
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found;
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is REAL array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*> Q is REAL array, dimension (LDQ, N)
+*> If JOBZ = 'V', the N-by-N orthogonal matrix used in the
+*> reduction to tridiagonal form.
+*> If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*> LDQ is INTEGER
+*> The leading dimension of the array Q. If JOBZ = 'V', then
+*> LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing AB to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*SLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is REAL array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 7*N, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS + 2*N
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+ $ LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+ $ LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ REAL AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+ $ Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+ $ LQUERY
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ NSPLIT
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSB
+ EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SGEMV, SLACPY, SLASCL, SSCAL,
+ $ SSTEBZ, SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA,
+ $ SSYTRD_SB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -7
+ ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -11
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -12
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -13
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+ $ INFO = -18
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -20
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSBEVX_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ M = 1
+ IF( LOWER ) THEN
+ TMP1 = AB( 1, 1 )
+ ELSE
+ TMP1 = AB( KD+1, 1 )
+ END IF
+ IF( VALEIG ) THEN
+ IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+ $ M = 0
+ END IF
+ IF( M.EQ.1 ) THEN
+ W( 1 ) = TMP1
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ ELSE
+ VLL = ZERO
+ VUU = ZERO
+ END IF
+ ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDHOUS = INDE + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal
+* to zero, then call SSTERF or SSTEQR. If this fails for some
+* eigenvalue, then try SSTEBZ.
+*
+ TEST = .FALSE.
+ IF (INDEIG) THEN
+ IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+ CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+ INDEE = INDWRK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL SLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+ $ WORK( INDWRK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 10 I = 1, N
+ IFAIL( I ) = 0
+ 10 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWO = INDISP + N
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by SSTEIN.
+*
+ DO 20 J = 1, M
+ CALL SCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+ CALL SGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO,
+ $ Z( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of SSBEVX_2STAGE
+*
+ END
diff --git a/SRC/ssyev_2stage.f b/SRC/ssyev_2stage.f
new file mode 100644
index 00000000..52f11c35
--- /dev/null
+++ b/SRC/ssyev_2stage.f
@@ -0,0 +1,348 @@
+*> \brief <b> SSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @generated from dsyev_2stage.f, fortran d -> s, Sat Nov 5 23:55:51 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSY
+ EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF,
+ $ XERBLA, SSYTRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -8
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = A( 1, 1 )
+ WORK( 1 ) = 2
+ IF( WANTZ )
+ $ A( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = INDE + N
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, first call
+* SORGTR to generate the orthogonal matrix, then call SSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+* Not available in this release, and agrument checking should not
+* let it getting here
+ RETURN
+ CALL SORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+ $ LLWORK, IINFO )
+ CALL SSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ),
+ $ INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of SSYEV_2STAGE
+*
+ END
diff --git a/SRC/ssyevd_2stage.f b/SRC/ssyevd_2stage.f
new file mode 100644
index 00000000..8510b645
--- /dev/null
+++ b/SRC/ssyevd_2stage.f
@@ -0,0 +1,406 @@
+*> \brief <b> SSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @generated from dsyevd_2stage.f, fortran d -> s, Sat Nov 5 23:55:54 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* REAL A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array,
+*> dimension (LWORK)
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If N <= 1, LWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N+1
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be at least
+*> 1 + 6*N + 2*N**2.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK and IWORK
+*> arrays, returns these values as the first entries of the WORK
+*> and IWORK arrays, and no error message related to LWORK or
+*> LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK and IWORK arrays, and no error message related to
+*> LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+*> to converge; i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> if INFO = i and JOBZ = 'V', then the algorithm failed
+*> to compute an eigenvalue while working on the submatrix
+*> lying in rows and columns INFO/(N+1) through
+*> mod(INFO,N+1).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date September 2012
+*
+*> \ingroup realSYeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA \n
+*> Modified by Francoise Tisseur, University of Tennessee \n
+*> Modified description of INFO. Sven, 16 Feb 05. \n
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.2) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* September 2012
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LIWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ REAL A( LDA, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+*
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE,
+ $ LIWMIN, LLWORK, LLWRK2, LWMIN,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSY
+ EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF,
+ $ SSYTRD_2STAGE, XERBLA
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LIWMIN = 1
+ LWMIN = 1
+ ELSE
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LIWMIN = 3 + 5*N
+ LWMIN = 1 + 6*N + 2*N**2
+ ELSE
+ LIWMIN = 1
+ LWMIN = 2*N + 1 + LHTRD + LWTRD
+ END IF
+ END IF
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -8
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = A( 1, 1 )
+ IF( WANTZ )
+ $ A( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = INDE + N
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call SSTERF. For eigenvectors, first call
+* SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+* tridiagonal matrix, then call SORMTR to multiply it by the
+* Householder transformations stored in A.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL SSTERF( N, W, WORK( INDE ), INFO )
+ ELSE
+* Not available in this release, and agrument checking should not
+* let it getting here
+ RETURN
+ CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+ CALL SORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+ $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+ CALL SLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 )
+ $ CALL SSCAL( N, ONE / SIGMA, W, 1 )
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of SSYEVD_2STAGE
+*
+ END
diff --git a/SRC/ssyevr_2stage.f b/SRC/ssyevr_2stage.f
new file mode 100644
index 00000000..27b99303
--- /dev/null
+++ b/SRC/ssyevr_2stage.f
@@ -0,0 +1,745 @@
+*> \brief <b> SSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @generated from dsyevr_2stage.f, fortran d -> s, Sat Nov 5 23:50:10 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+* LWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER ISUPPZ( * ), IWORK( * )
+* REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> SSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to SSYTRD. Then, whenever possible, SSYEVR_2STAGE calls SSTEMR to compute
+*> the eigenspectrum using Relatively Robust Representations. SSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*> (a) Compute T - sigma I = L D L^T, so that L and D
+*> define all the wanted eigenvalues to high relative accuracy.
+*> This means that small relative changes in the entries of D and L
+*> cause only small relative changes in the eigenvalues and
+*> eigenvectors. The standard (unfactored) representation of the
+*> tridiagonal matrix T does not have this property in general.
+*> (b) Compute the eigenvalues to suitable accuracy.
+*> If the eigenvectors are desired, the algorithm attains full
+*> accuracy of the computed eigenvalues only right before
+*> the corresponding vectors have to be computed, see steps c) and d).
+*> (c) For each cluster of close eigenvalues, select a new
+*> shift close to the cluster, find a new factorization, and refine
+*> the shifted eigenvalues to suitable accuracy.
+*> (d) For each eigenvalue with a large enough relative separation compute
+*> the corresponding eigenvector by forming a rank revealing twisted
+*> factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see SSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*> 2004. Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*> tridiagonal eigenvalue/eigenvector problem",
+*> Computer Science Division Technical Report No. UCB/CSD-97-971,
+*> UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : SSYEVR_2STAGE calls SSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> SSYEVR_2STAGE calls SSTEBZ and SSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of SSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and
+*> SSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*>
+*> If high relative accuracy is important, set ABSTOL to
+*> SLAMCH( 'Safe minimum' ). Doing so will guarantee that
+*> eigenvalues are computed to high relative accuracy when
+*> possible in future releases. The current code does not
+*> make any guarantees about high relative accuracy, but
+*> future releases will. See J. Barlow and J. Demmel,
+*> "Computing Accurate Eigensystems of Scaled Diagonally
+*> Dominant Matrices", LAPACK Working Note #7, for a discussion
+*> of which matrices define their eigenvalues to high relative
+*> accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is REAL array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> Supplying N columns is always safe.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*> The support of the eigenvectors in Z, i.e., the indices
+*> indicating the nonzero elements in Z. The i-th eigenvector
+*> is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*> ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal
+*> matrix). The support of the eigenvectors of A is typically
+*> 1:N because of the orthogonal transformations applied by SORMTR.
+*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 26*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 5*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 5*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK. LIWORK >= max(1,10*N).
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal size of the IWORK array,
+*> returns this value as the first entry of the IWORK array, and
+*> no error message related to LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: Internal error
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Inderjit Dhillon, IBM Almaden, USA \n
+*> Osni Marques, LBNL/NERSC, USA \n
+*> Ken Stanley, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*> Jason Riedy, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+ $ LWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER ISUPPZ( * ), IWORK( * )
+ REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE, TWO
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ,
+ $ TRYRAC, TEST
+ CHARACTER ORDER
+ INTEGER I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE,
+ $ INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU,
+ $ INDWK, INDWKN, ISCALE, J, JJ, LIWMIN,
+ $ LLWORK, LLWRKN, LWMIN, NSPLIT,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSY
+ EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
+ $ SSTERF, SSWAP, SSYTRD_2STAGE, XERBLA
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ IEEEOK = ILAENV( 10, 'SSYEVR', 'N', 1, 2, 3, 4 )
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+*
+ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
+*
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD )
+ LIWMIN = MAX( 1, 10*N )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -18
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -20
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+* NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
+* NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
+* LWKOPT = MAX( ( NB+1 )*N, LWMIN )
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYEVR_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ WORK( 1 ) = 26
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ ELSE
+ IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ END IF
+ END IF
+ IF( WANTZ ) THEN
+ Z( 1, 1 ) = ONE
+ ISUPPZ( 1 ) = 1
+ ISUPPZ( 2 ) = 1
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF (VALEIG) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL SSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+
+* Initialize indices into workspaces. Note: The IWORK indices are
+* used only if SSTERF or SSTEMR fail.
+
+* WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the
+* elementary reflectors used in SSYTRD.
+ INDTAU = 1
+* WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries.
+ INDD = INDTAU + N
+* WORK(INDE:INDE+N-1) stores the off-diagonal entries of the
+* tridiagonal matrix from SSYTRD.
+ INDE = INDD + N
+* WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over
+* -written by SSTEMR (the SSTERF path copies the diagonal to W).
+ INDDD = INDE + N
+* WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over
+* -written while computing the eigenvalues in SSTERF and SSTEMR.
+ INDEE = INDDD + N
+* INDHOUS is the starting offset Householder storage of stage 2
+ INDHOUS = INDEE + N
+* INDWK is the starting offset of the left-over workspace, and
+* LLWORK is the remaining workspace size.
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+
+
+* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and
+* stores the block indices of each of the M<=N eigenvalues.
+ INDIBL = 1
+* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and
+* stores the starting and finishing indices of each block.
+ INDISP = INDIBL + N
+* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+* that corresponding to eigenvectors that fail to converge in
+* SSTEIN. This information is discarded; if any fail, the driver
+* returns INFO > 0.
+ INDIFL = INDISP + N
+* INDIWO is the offset of the remaining integer workspace.
+ INDIWO = INDIFL + N
+
+*
+* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+*
+ CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired
+* then call SSTERF or SSTEMR and SORMTR.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 )
+*
+ IF (ABSTOL .LE. TWO*N*EPS) THEN
+ TRYRAC = .TRUE.
+ ELSE
+ TRYRAC = .FALSE.
+ END IF
+ CALL SSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ),
+ $ VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ,
+ $ TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK,
+ $ INFO )
+*
+*
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by SSTEMR.
+*
+ IF( WANTZ .AND. INFO.EQ.0 ) THEN
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA,
+ $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+ $ LLWRKN, IINFO )
+ END IF
+ END IF
+*
+*
+ IF( INFO.EQ.0 ) THEN
+* Everything worked. Skip SSTEBZ/SSTEIN. IWORK(:) are
+* undefined.
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+* Also call SSTEBZ and SSTEIN if SSTEMR fails.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+ $ INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by SSTEIN.
+*
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+* Jump here if SSTEMR/SSTEIN succeeded.
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors. Note: We do not sort the IFAIL portion of IWORK.
+* It may not be initialized (if SSTEMR/SSTEIN succeeded), and we do
+* not return this detailed information to the user.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ W( I ) = W( J )
+ W( J ) = TMP1
+ CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of SSYEVR_2STAGE
+*
+ END
diff --git a/SRC/ssyevx_2stage.f b/SRC/ssyevx_2stage.f
new file mode 100644
index 00000000..96a73ecd
--- /dev/null
+++ b/SRC/ssyevx_2stage.f
@@ -0,0 +1,608 @@
+*> \brief <b> SSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+* @generated from dsyevx_2stage.f, fortran d -> s, Sat Nov 5 23:55:46 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+* LWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+* REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of indices
+*> for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is REAL
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is REAL
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is REAL
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*SLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> On normal exit, the first M elements contain the selected
+*> eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is REAL array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 8*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 3*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 3*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+ $ LWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+ REAL ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ZERO, ONE
+ PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE,
+ $ ITMP1, J, JJ, LLWORK, LLWRKN,
+ $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ REAL SLAMCH, SLANSY
+ EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
+ $ SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA,
+ $ SSYTRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD )
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -17
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ ELSE
+ IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+ M = 1
+ W( 1 ) = A( 1, 1 )
+ END IF
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = SLAMCH( 'Safe minimum' )
+ EPS = SLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL SSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+ INDTAU = 1
+ INDE = INDTAU + N
+ INDD = INDE + N
+ INDHOUS = INDD + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
+ $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal to
+* zero, then call SSTERF or SORGTR and SSTEQR. If this fails for
+* some eigenvalue, then try SSTEBZ.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+ CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+ INDEE = INDWRK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SSTERF( N, W, WORK( INDEE ), INFO )
+ ELSE
+ CALL SLACPY( 'A', N, N, A, LDA, Z, LDZ )
+ CALL SORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+ $ WORK( INDWRK ), LLWORK, IINFO )
+ CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+ CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+ $ WORK( INDWRK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 30 I = 1, N
+ IFAIL( I ) = 0
+ 30 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 40
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWO = INDISP + N
+ CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+* Apply orthogonal matrix used in reduction to tridiagonal
+* form to eigenvectors returned by SSTEIN.
+*
+ INDWKN = INDE
+ LLWRKN = LWORK - INDWKN + 1
+ CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 40 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 60 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 50 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 50 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 60 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of SSYEVX_2STAGE
+*
+ END
diff --git a/SRC/ssygv_2stage.f b/SRC/ssygv_2stage.f
new file mode 100644
index 00000000..6eb172e9
--- /dev/null
+++ b/SRC/ssygv_2stage.f
@@ -0,0 +1,371 @@
+*> \brief \b SSYGV_2STAGE
+*
+* @generated from dsygv_2stage.f, fortran d -> s, Sun Nov 6 12:54:29 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a real generalized symmetric-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be symmetric and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+* sizes N>2000.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*> ITYPE is INTEGER
+*> Specifies the problem type to be solved:
+*> = 1: A*x = (lambda)*B*x
+*> = 2: A*B*x = (lambda)*x
+*> = 3: B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangles of A and B are stored;
+*> = 'L': Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrices A and B. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA, N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*>
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> matrix Z of eigenvectors. The eigenvectors are normalized
+*> as follows:
+*> if ITYPE = 1 or 2, Z**T*B*Z = I;
+*> if ITYPE = 3, Z**T*inv(B)*Z = I.
+*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*> or the lower triangle (if UPLO='L') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is REAL array, dimension (LDB, N)
+*> On entry, the symmetric positive definite matrix B.
+*> If UPLO = 'U', the leading N-by-N upper triangular part of B
+*> contains the upper triangular part of the matrix B.
+*> If UPLO = 'L', the leading N-by-N lower triangular part of B
+*> contains the lower triangular part of the matrix B.
+*>
+*> On exit, if INFO <= N, the part of B containing the matrix is
+*> overwritten by the triangular factor U or L from the Cholesky
+*> factorization B = U**T*U or B = L*L**T.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B. LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is REAL array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + 2*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: SPOTRF or SSYEV returned an error code:
+*> <= N: if INFO = i, SSYEV failed to converge;
+*> i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> > N: if INFO = N + i, for 1 <= i <= N, then the leading
+*> minor of order i of B is not positive definite.
+*> The factorization of B could not be completed and
+*> no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ONE
+ PARAMETER ( ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTZ
+ CHARACTER TRANS
+ INTEGER NEIG,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL SPOTRF, SSYGST, STRMM, STRSM, XERBLA,
+ $ SSYEV_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+ INFO = -8
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = 2*N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYGV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+* Form a Cholesky factorization of B.
+*
+ CALL SPOTRF( UPLO, N, B, LDB, INFO )
+ IF( INFO.NE.0 ) THEN
+ INFO = N + INFO
+ RETURN
+ END IF
+*
+* Transform problem to standard eigenvalue problem and solve.
+*
+ CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+ CALL SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
+*
+ IF( WANTZ ) THEN
+*
+* Backtransform eigenvectors to the original problem.
+*
+ NEIG = N
+ IF( INFO.GT.0 )
+ $ NEIG = INFO - 1
+ IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+* backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'N'
+ ELSE
+ TRANS = 'T'
+ END IF
+*
+ CALL STRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+*
+ ELSE IF( ITYPE.EQ.3 ) THEN
+*
+* For B*A*x=(lambda)*x;
+* backtransform eigenvectors: x = L*y or U**T*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'T'
+ ELSE
+ TRANS = 'N'
+ END IF
+*
+ CALL STRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+ END IF
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of SSYGV_2STAGE
+*
+ END
diff --git a/SRC/ssytrd_2stage.f b/SRC/ssytrd_2stage.f
new file mode 100644
index 00000000..fba3dd45
--- /dev/null
+++ b/SRC/ssytrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b SSYTRD_2STAGE
+*
+* @generated from zhetrd_2stage.f, fortran z -> s, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+* HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER VECT, UPLO
+* INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+* REAL D( * ), E( * )
+* REAL A( LDA, * ), TAU( * ),
+* HOUS2( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q1**T Q2**T* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> in particular for the second stage (Band to
+*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate Q1 Q2 or to apply Q1 Q2,
+*> then LHOUS2 is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the band superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> internal band-diagonal matrix AB, and the elements above
+*> the KD superdiagonal, with the array TAU, represent the orthogonal
+*> matrix Q1 as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and band subdiagonal of A are over-
+*> written by the corresponding elements of the internal band-diagonal
+*> matrix AB, and the elements below the KD subdiagonal, with
+*> the array TAU, represent the orthogonal matrix Q1 as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is REAL array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is REAL array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is REAL array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors of
+*> the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*> HOUS2 is REAL array, dimension LHOUS2, that
+*> store the Householder representation of the stage2
+*> band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*> LHOUS2 is INTEGER
+*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS2 array, returns
+*> this value as the first entry of the HOUS2 array, and no error
+*> message related to LHOUS2 is issued by XERBLA.
+*> LHOUS2 = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+ $ HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER VECT, UPLO
+ INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+ REAL D( * ), E( * )
+ REAL A( LDA, * ), TAU( * ),
+ $ HOUS2( * ), WORK( * )
+* ..
+*
+* =====================================================================
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTQ
+ INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, SSYTRD_SY2SB, SSYTRD_SB2ST
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ INFO = 0
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ KD = ILAENV( 17, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+* $ LHMIN, LWMIN
+*
+ IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYTRD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDAB = KD+1
+ LWRK = LWORK-LDAB*N
+ ABPOS = 1
+ WPOS = ABPOS + LDAB*N
+ CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
+ $ TAU, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
+ RETURN
+ END IF
+ CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+ $ WORK( ABPOS ), LDAB, D, E,
+ $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
+ RETURN
+ END IF
+*
+*
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of SSYTRD_2STAGE
+*
+ END
diff --git a/SRC/ssytrd_sb2st.F b/SRC/ssytrd_sb2st.F
new file mode 100644
index 00000000..edbcf125
--- /dev/null
+++ b/SRC/ssytrd_sb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b SSBTRD
+*
+* @generated from zhetrd_hb2st.F, fortran z -> s, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+* D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+* #define PRECISION_REAL
+*
+* #if defined(_OPENMP)
+* use omp_lib
+* #endif
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER STAGE1, UPLO, VECT
+* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+* REAL D( * ), E( * )
+* REAL AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSBTRD reduces a real symmetric band matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q**T * A * Q = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*> STAGE is CHARACTER*1
+*> = 'N': "No": to mention that the stage 1 of the reduction
+*> from dense to band using the ssytrd_sy2sb routine
+*> was not called before this routine to reproduce AB.
+*> In other term this routine is called as standalone.
+*> = 'Y': "Yes": to mention that the stage 1 of the
+*> reduction from dense to band using the ssytrd_sy2sb
+*> routine has been called to produce AB (e.g., AB is
+*> the output of ssytrd_sy2sb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> and thus LHOUS is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate or to apply Q later on,
+*> then LHOUS is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is REAL array, dimension (LDAB,N)
+*> On entry, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> On exit, the diagonal elements of AB are overwritten by the
+*> diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*> elements on the first superdiagonal (if UPLO = 'U') or the
+*> first subdiagonal (if UPLO = 'L') are overwritten by the
+*> off-diagonal elements of T; the rest of AB is overwritten by
+*> values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is REAL array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is REAL array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T:
+*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*> HOUS is REAL array, dimension LHOUS, that
+*> store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*> LHOUS is INTEGER
+*> The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS array, returns
+*> this value as the first entry of the HOUS array, and no error
+*> message related to LHOUS is issued by XERBLA.
+*> LHOUS = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup real16OTHERcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+ $ D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_REAL
+*
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER STAGE1, UPLO, VECT
+ INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+ REAL D( * ), E( * )
+ REAL AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL RZERO
+ REAL ZERO, ONE
+ PARAMETER ( RZERO = 0.0E+0,
+ $ ZERO = 0.0E+0,
+ $ ONE = 1.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
+ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
+ $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+ $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+ $ NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+ $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
+ $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+ $ SISEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+ REAL ABSTMP
+ REAL TMP
+#endif
+* ..
+* .. External Subroutines ..
+ EXTERNAL SSB2ST_KERNELS, SLACPY, SLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX, CEILING, REAL
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required.
+* Test the input parameters
+*
+ DEBUG = 0
+ INFO = 0
+ AFTERS1 = LSAME( STAGE1, 'Y' )
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+*
+ IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.(KD+1) ) THEN
+ INFO = -7
+ ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDV = KD + IB
+ SIZETAU = 2 * N
+ SISEV = 2 * N
+ INDTAU = 1
+ INDV = INDTAU + SIZETAU
+ LDA = 2 * KD + 1
+ SIZEA = LDA * N
+ INDA = 1
+ INDW = INDA + SIZEA
+ NTHREADS = 1
+ TID = 0
+*
+ IF( UPPER ) THEN
+ APOS = INDA + KD
+ AWPOS = INDA
+ DPOS = APOS + KD
+ OFDPOS = DPOS - 1
+ ABDPOS = KD + 1
+ ABOFDPOS = KD
+ ELSE
+ APOS = INDA
+ AWPOS = INDA + KD + 1
+ DPOS = APOS
+ OFDPOS = DPOS + 1
+ ABDPOS = 1
+ ABOFDPOS = 2
+
+ ENDIF
+*
+* Case KD=0:
+* The matrix is diagonal. We just copy it (convert to "real" for
+* real because D is double and the imaginary part should be 0)
+* and store it in D. A sequential code here is better or
+* in a parallel environment it might need two cores for D and E
+*
+ IF( KD.EQ.0 ) THEN
+ DO 30 I = 1, N
+ D( I ) = ( AB( ABDPOS, I ) )
+ 30 CONTINUE
+ DO 40 I = 1, N-1
+ E( I ) = RZERO
+ 40 CONTINUE
+ GOTO 200
+ END IF
+*
+* Case KD=1:
+* The matrix is already Tridiagonal. We have to make diagonal
+* and offdiagonal elements real, and store them in D and E.
+* For that, for real precision just copy the diag and offdiag
+* to D and E while for the COMPLEX case the bulge chasing is
+* performed to convert the hermetian tridiagonal to symmetric
+* tridiagonal. A simpler coversion formula might be used, but then
+* updating the Q matrix will be required and based if Q is generated
+* or not this might complicate the story.
+*
+C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+ IF( KD.EQ.1 ) THEN
+ DO 50 I = 1, N
+ D( I ) = ( AB( ABDPOS, I ) )
+ 50 CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+* make off-diagonal elements real and copy them to E
+*
+ IF( UPPER ) THEN
+ DO 60 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I+1 )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I+1 ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C IF( WANTZ ) THEN
+C CALL SSCAL( N, ( TMP ), Q( 1, I+1 ), 1 )
+C END IF
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C IF( WANTQ ) THEN
+C CALL SSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C END IF
+ 70 CONTINUE
+ ENDIF
+#else
+ IF( UPPER ) THEN
+ DO 60 I = 1, N-1
+ E( I ) = ( AB( ABOFDPOS, I+1 ) )
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N-1
+ E( I ) = ( AB( ABOFDPOS, I ) )
+ 70 CONTINUE
+ ENDIF
+#endif
+ GOTO 200
+ END IF
+*
+* Main code start here.
+* Reduce the symmetric band of A to a tridiagonal matrix.
+*
+ THGRSIZ = N
+ GRSIZ = 1
+ SHIFT = 3
+ NBTILES = CEILING( REAL(N)/REAL(KD) )
+ STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+ THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*
+ CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+ CALL SLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+* openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
+!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+* main bulge chasing loop
+*
+ DO 100 THGRID = 1, THGRNB
+ STT = (THGRID-1)*THGRSIZ+1
+ THED = MIN( (STT + THGRSIZ -1), (N-1))
+ DO 110 I = STT, N-1
+ ED = MIN( I, THED )
+ IF( STT.GT.ED ) GOTO 100
+ DO 120 M = 1, STEPERCOL
+ ST = STT
+ DO 130 SWEEPID = ST, ED
+ DO 140 K = 1, GRSIZ
+ MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ)
+ $ + (M-1)*GRSIZ + K
+ IF ( MYID.EQ.1 ) THEN
+ TTYPE = 1
+ ELSE
+ TTYPE = MOD( MYID, 2 ) + 2
+ ENDIF
+
+ IF( TTYPE.EQ.2 ) THEN
+ COLPT = (MYID/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ BLKLASTIND = COLPT
+ ELSE
+ COLPT = ((MYID+1)/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ IF( ( STIND.GE.EDIND-1 ).AND.
+ $ ( EDIND.EQ.N ) ) THEN
+ BLKLASTIND = N
+ ELSE
+ BLKLASTIND = 0
+ ENDIF
+ ENDIF
+*
+* Call the kernel
+*
+#if defined(_OPENMP)
+ IF( TTYPE.NE.1 ) THEN
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(in:WORK(MYID-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ENDIF
+#else
+ CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+#endif
+ IF ( BLKLASTIND.GE.(N-1) ) THEN
+ STT = STT + 1
+ GOTO 130
+ ENDIF
+ 140 CONTINUE
+ 130 CONTINUE
+ 120 CONTINUE
+ 110 CONTINUE
+ 100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*
+* Copy the diagonal from A to D. Note that D is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ DO 150 I = 1, N
+ D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
+ 150 CONTINUE
+*
+* Copy the off diagonal from A to E. Note that E is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ IF( UPPER ) THEN
+ DO 160 I = 1, N-1
+ E( I ) = ( WORK( OFDPOS+I*LDA ) )
+ 160 CONTINUE
+ ELSE
+ DO 170 I = 1, N-1
+ E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
+ 170 CONTINUE
+ ENDIF
+*
+ 200 CONTINUE
+*
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of SSYTRD_SB2ST
+*
+ END
+#undef PRECISION_REAL
+
diff --git a/SRC/ssytrd_sy2sb.f b/SRC/ssytrd_sy2sb.f
new file mode 100644
index 00000000..3dbbaf1f
--- /dev/null
+++ b/SRC/ssytrd_sy2sb.f
@@ -0,0 +1,517 @@
+*> \brief \b SSYTRD_SY2SB
+*
+* @generated from zhetrd_he2hb.f, fortran z -> s, Sun Nov 6 19:34:06 2016
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYTRD_SY2SB + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), AB( LDAB, * ),
+* TAU( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric
+*> band-diagonal form AB by a orthogonal similarity transformation:
+*> Q**T * A * Q = AB.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*> On entry, the symmetric matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> tridiagonal matrix T, and the elements above the first
+*> superdiagonal, with the array TAU, represent the orthogonal
+*> matrix Q as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and first subdiagonal of A are over-
+*> written by the corresponding elements of the tridiagonal
+*> matrix T, and the elements below the first subdiagonal, with
+*> the array TAU, represent the orthogonal matrix Q as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*> AB is REAL array, dimension (LDAB,N)
+*> On exit, the upper or lower triangle of the symmetric band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is REAL array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors (see Further
+*> Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension LWORK.
+*> On exit, if INFO = 0, or if LWORK=-1,
+*> WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK which should be calculated
+* by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*> where FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*> putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*> If UPLO = 'U', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**T
+*>
+*> where tau is a real scalar, and v is a real vector with
+*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*> A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*> If UPLO = 'L', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**T
+*>
+*> where tau is a real scalar, and v is a real vector with
+*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+* A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*> The contents of A on exit are illustrated by the following examples
+*> with n = 5:
+*>
+*> if UPLO = 'U': if UPLO = 'L':
+*>
+*> ( ab ab/v1 v1 v1 v1 ) ( ab )
+*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab )
+*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab )
+*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab )
+*> ( ab ) ( v1 v2 v3 ab/v4 ab )
+*>
+*> where d and e denote diagonal and off-diagonal elements of T, and vi
+*> denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), AB( LDAB, * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL RONE
+ REAL ZERO, ONE, HALF
+ PARAMETER ( RONE = 1.0E+0,
+ $ ZERO = 0.0E+0,
+ $ ONE = 1.0E+0,
+ $ HALF = 0.5E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER
+ INTEGER I, J, IINFO, LWMIN, PN, PK, LK,
+ $ LDT, LDW, LDS2, LDS1,
+ $ LS2, LS1, LW, LT,
+ $ TPOS, WPOS, S2POS, S1POS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, SSYR2K, SSYMM, SGEMM,
+ $ SLARFT, SGELQF, SGEQRF, SLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required
+* and test the input parameters
+*
+ INFO = 0
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+ LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
+
+ IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -2
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+ INFO = -7
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ WORK( 1 ) = LWMIN
+ RETURN
+ END IF
+*
+* Quick return if possible
+* Copy the upper/lower portion of A into AB
+*
+ IF( N.LE.KD+1 ) THEN
+ IF( UPPER ) THEN
+ DO 100 I = 1, N
+ LK = MIN( KD+1, I )
+ CALL SCOPY( LK, A( I-LK+1, I ), 1,
+ $ AB( KD+1-LK+1, I ), 1 )
+ 100 CONTINUE
+ ELSE
+ DO 110 I = 1, N
+ LK = MIN( KD+1, N-I+1 )
+ CALL SCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+ 110 CONTINUE
+ ENDIF
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine the pointer position for the workspace
+*
+ LDT = KD
+ LDS1 = KD
+ LT = LDT*KD
+ LW = N*KD
+ LS1 = LDS1*KD
+ LS2 = LWMIN - LT - LW - LS1
+* LS2 = N*MAX(KD,FACTOPTNB)
+ TPOS = 1
+ WPOS = TPOS + LT
+ S1POS = WPOS + LW
+ S2POS = S1POS + LS1
+ IF( UPPER ) THEN
+ LDW = KD
+ LDS2 = KD
+ ELSE
+ LDW = N
+ LDS2 = N
+ ENDIF
+*
+*
+* Set the workspace of the triangular matrix T to zero once such a
+* way everytime T is generated the upper/lower portion will be always zero
+*
+ CALL SLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+ IF( UPPER ) THEN
+ DO 10 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the LQ factorization of the current block
+*
+ CALL SGELQF( KD, PN, A( I, I+KD ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 20 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 20 CONTINUE
+*
+ CALL SLASET( 'Lower', PK, PK, ZERO, ONE,
+ $ A( I, I+KD ), LDA )
+*
+* Form the matrix T
+*
+ CALL SLARFT( 'Forward', 'Rowwise', PN, PK,
+ $ A( I, I+KD ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL SGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+ $ ONE, WORK( TPOS ), LDT,
+ $ A( I, I+KD ), LDA,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL SSYMM( 'Right', UPLO, PK, PN,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL SGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+ $ ONE, WORK( WPOS ), LDW,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL SGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+ $ -HALF, WORK( S1POS ), LDS1,
+ $ A( I, I+KD ), LDA,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V'*W - W'*V
+*
+ CALL SSYR2K( UPLO, 'Conjugate', PN, PK,
+ $ -ONE, A( I, I+KD ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+ 10 CONTINUE
+*
+* Copy the upper band to AB which is the band storage matrix
+*
+ DO 30 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 30 CONTINUE
+*
+ ELSE
+*
+* Reduce the lower triangle of A to lower band matrix
+*
+ DO 40 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the QR factorization of the current block
+*
+ CALL SGEQRF( PN, KD, A( I+KD, I ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 50 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 50 CONTINUE
+*
+ CALL SLASET( 'Upper', PK, PK, ZERO, ONE,
+ $ A( I+KD, I ), LDA )
+*
+* Form the matrix T
+*
+ CALL SLARFT( 'Forward', 'Columnwise', PN, PK,
+ $ A( I+KD, I ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ ONE, A( I+KD, I ), LDA,
+ $ WORK( TPOS ), LDT,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL SSYMM( 'Left', UPLO, PN, PK,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL SGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+ $ ONE, WORK( S2POS ), LDS2,
+ $ WORK( WPOS ), LDW,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ -HALF, A( I+KD, I ), LDA,
+ $ WORK( S1POS ), LDS1,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V*W' - W*V'
+*
+ CALL SSYR2K( UPLO, 'No transpose', PN, PK,
+ $ -ONE, A( I+KD, I ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+* ==================================================================
+* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+* DO 45 J = I, I+PK-1
+* LK = MIN( KD, N-J ) + 1
+* CALL SCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+* 45 CONTINUE
+* ==================================================================
+ 40 CONTINUE
+*
+* Copy the lower band to AB which is the band storage matrix
+*
+ DO 60 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 60 CONTINUE
+
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of SSYTRD_SY2SB
+*
+ END
diff --git a/SRC/zhb2st_kernels.f b/SRC/zhb2st_kernels.f
new file mode 100644
index 00000000..ab03b303
--- /dev/null
+++ b/SRC/zhb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b ZHB2ST_KERNELS
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhb2st_kernels.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+* ST, ED, SWEEP, N, NB, IB,
+* A, LDA, V, TAU, LDVT, WORK)
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* LOGICAL WANTZ
+* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 A( LDA, * ), V( * ),
+* TAU( * ), WORK( * )
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHB2ST_KERNELS is an internal routine used by the ZHETRD_HB2ST
+*> subroutine.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> @param[in] n
+*> The order of the matrix A.
+*>
+*> @param[in] nb
+*> The size of the band.
+*>
+*> @param[in, out] A
+*> A pointer to the matrix A.
+*>
+*> @param[in] lda
+*> The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*> COMPLEX*16 array, dimension 2*n if eigenvalues only are
+*> requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*> COMPLEX*16 array, dimension (2*n).
+*> The scalar factors of the Householder reflectors are stored
+*> in this array.
+*>
+*> @param[in] st
+*> internal parameter for indices.
+*>
+*> @param[in] ed
+*> internal parameter for indices.
+*>
+*> @param[in] sweep
+*> internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*> internal parameter for indices.
+*>
+*> @param[in] wantz
+*> logical which indicate if Eigenvalue are requested or both
+*> Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*> Workspace of size nb.
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
+ $ ST, ED, SWEEP, N, NB, IB,
+ $ A, LDA, V, TAU, LDVT, WORK)
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ LOGICAL WANTZ
+ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 A( LDA, * ), V( * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ),
+ $ ONE = ( 1.0D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL UPPER
+ INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS,
+ $ DPOS, OFDPOS, AJETER
+ COMPLEX*16 CTMP
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZLARFG, ZLARFX, ZLARFY
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DCONJG, MOD
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* ..
+* .. Executable Statements ..
+*
+ AJETER = IB + LDVT
+ UPPER = LSAME( UPLO, 'U' )
+
+ IF( UPPER ) THEN
+ DPOS = 2 * NB + 1
+ OFDPOS = 2 * NB
+ ELSE
+ DPOS = 1
+ OFDPOS = 2
+ ENDIF
+
+*
+* Upper case
+*
+ IF( UPPER ) THEN
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 101, 102, 103 ) TTYPE
+*
+ 101 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 10 I = 1, LM-1
+ V( VPOS+I ) = DCONJG( A( OFDPOS-I, ST+I ) )
+ A( OFDPOS-I, ST+I ) = ZERO
+ 10 CONTINUE
+ CTMP = DCONJG( A( OFDPOS, ST ) )
+ CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+ A( OFDPOS, ST ) = CTMP
+*
+ 103 CONTINUE
+ LM = ED - ST + 1
+ CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+ GOTO 300
+*
+ 102 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+ IF( LM.GT.0) THEN
+ CALL ZLARFX( 'Left', LN, LM, V( VPOS ),
+ $ DCONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 30 I = 1, LM-1
+ V( VPOS+I ) = DCONJG( A( DPOS-NB-I, J1+I ) )
+ A( DPOS-NB-I, J1+I ) = ZERO
+ 30 CONTINUE
+ CTMP = DCONJG( A( DPOS-NB, J1 ) )
+ CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+ A( DPOS-NB, J1 ) = CTMP
+*
+ CALL ZLARFX( 'Right', LN-1, LM, V( VPOS ),
+ $ TAU( TAUPOS ),
+ $ A( DPOS-NB+1, J1 ), LDA-1, WORK)
+ ENDIF
+ GOTO 300
+*
+* Lower case
+*
+ ELSE
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + ST
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+ ENDIF
+ GO TO ( 201, 202, 203 ) TTYPE
+*
+ 201 CONTINUE
+ LM = ED - ST + 1
+*
+ V( VPOS ) = ONE
+ DO 20 I = 1, LM-1
+ V( VPOS+I ) = A( OFDPOS+I, ST-1 )
+ A( OFDPOS+I, ST-1 ) = ZERO
+ 20 CONTINUE
+ CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ 203 CONTINUE
+ LM = ED - ST + 1
+*
+ CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ),
+ $ A( DPOS, ST ), LDA-1, WORK)
+
+ GOTO 300
+*
+ 202 CONTINUE
+ J1 = ED+1
+ J2 = MIN( ED+NB, N )
+ LN = ED-ST+1
+ LM = J2-J1+1
+*
+ IF( LM.GT.0) THEN
+ CALL ZLARFX( 'Right', LM, LN, V( VPOS ),
+ $ TAU( TAUPOS ), A( DPOS+NB, ST ),
+ $ LDA-1, WORK)
+*
+ IF( WANTZ ) THEN
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ELSE
+ VPOS = MOD( SWEEP-1, 2 ) * N + J1
+ TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+ ENDIF
+*
+ V( VPOS ) = ONE
+ DO 40 I = 1, LM-1
+ V( VPOS+I ) = A( DPOS+NB+I, ST )
+ A( DPOS+NB+I, ST ) = ZERO
+ 40 CONTINUE
+ CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
+ $ TAU( TAUPOS ) )
+*
+ CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ),
+ $ DCONJG( TAU( TAUPOS ) ),
+ $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+ ENDIF
+ GOTO 300
+ ENDIF
+
+ 300 CONTINUE
+ RETURN
+*
+* END OF ZHB2ST_KERNELS
+*
+ END
diff --git a/SRC/zhbev_2stage.f b/SRC/zhbev_2stage.f
new file mode 100644
index 00000000..f1088b87
--- /dev/null
+++ b/SRC/zhbev_2stage.f
@@ -0,0 +1,386 @@
+*> \brief <b> ZHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX*16 array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX*16 array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension LWORK
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (max(1,3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, N, LWORK
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, WANTZ, LQUERY
+ INTEGER IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHB
+ EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR
+ $ ZHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -11
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHBEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ W( 1 ) = DBLE( AB( 1, 1 ) )
+ ELSE
+ W( 1 ) = DBLE( AB( KD+1, 1 ) )
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = ONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDHOUS = 1
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ RWORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, call ZSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ INDRWK = INDE + N
+ CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of ZHBEV_2STAGE
+*
+ END
diff --git a/SRC/zhbevd_2stage.f b/SRC/zhbevd_2stage.f
new file mode 100644
index 00000000..e4daae74
--- /dev/null
+++ b/SRC/zhbevd_2stage.f
@@ -0,0 +1,458 @@
+*> \brief <b> ZHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+* WORK, LWORK, RWORK, LRWORK, IWORK,
+* LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it
+*> uses a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX*16 array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form. If UPLO = 'U', the first
+*> superdiagonal and the diagonal of the tridiagonal matrix T
+*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*> the diagonal and first subdiagonal of T are returned in the
+*> first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX*16 array, dimension (LDZ, N)
+*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*> eigenvectors of the matrix A, with the i-th column of Z
+*> holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array,
+*> dimension (LRWORK)
+*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The dimension of array RWORK.
+*> If N <= 1, LRWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*> If JOBZ = 'V' and N > 1, LRWORK must be at least
+*> 1 + 5*N + 2*N**2.
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of array IWORK.
+*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N .
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+ $ WORK, LWORK, RWORK, LRWORK, IWORK,
+ $ LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+ COMPLEX*16 CZERO, CONE
+ PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ),
+ $ CONE = ( 1.0D0, 0.0D0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE,
+ $ LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS,
+ $ LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHB
+ EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DSCAL, DSTERF, XERBLA, ZGEMM, ZLACPY,
+ $ ZLASCL, ZSTEDC, ZHETRD_HB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ LRWMIN = 1
+ LIWMIN = 1
+ ELSE
+ IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LWMIN = 2*N**2
+ LRWMIN = 1 + 5*N + 2*N**2
+ LIWMIN = 3 + 5*N
+ ELSE
+ LWMIN = MAX( N, LHTRD + LWTRD )
+ LRWMIN = N
+ LIWMIN = 1
+ END IF
+ END IF
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -6
+ ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -9
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHBEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = DBLE( AB( 1, 1 ) )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ END IF
+*
+* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDE = 1
+ INDRWK = INDE + N
+ LLRWK = LRWORK - INDRWK + 1
+ INDHOUS = 1
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+ INDWK2 = INDWK + N*N
+ LLWK2 = LWORK - INDWK2 + 1
+*
+ CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+ $ RWORK( INDE ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, call ZSTEDC.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ),
+ $ LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK,
+ $ INFO )
+ CALL ZGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO,
+ $ WORK( INDWK2 ), N )
+ CALL ZLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+ RETURN
+*
+* End of ZHBEVD_2STAGE
+*
+ END
diff --git a/SRC/zhbevx_2stage.f b/SRC/zhbevx_2stage.f
new file mode 100644
index 00000000..3efdcc74
--- /dev/null
+++ b/SRC/zhbevx_2stage.f
@@ -0,0 +1,646 @@
+*> \brief <b> ZHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+* Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+* Z, LDZ, WORK, LWORK, RWORK, IWORK,
+* IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+* $ Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors
+*> can be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found;
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found;
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX*16 array, dimension (LDAB, N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*> On exit, AB is overwritten by values generated during the
+*> reduction to tridiagonal form.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*> Q is COMPLEX*16 array, dimension (LDQ, N)
+*> If JOBZ = 'V', the N-by-N unitary matrix used in the
+*> reduction to tridiagonal form.
+*> If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*> LDQ is INTEGER
+*> The leading dimension of the array Q. If JOBZ = 'V', then
+*> LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing AB to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*DLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the size of the band.
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+ $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+ $ Z, LDZ, WORK, LWORK, RWORK, IWORK,
+ $ IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+ $ Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+ COMPLEX*16 CZERO, CONE
+ PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ),
+ $ CONE = ( 1.0D0, 0.0D0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+ $ LQUERY
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+ $ J, JJ, NSPLIT
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+ COMPLEX*16 CTMP1
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHB
+ EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZCOPY,
+ $ ZGEMV, ZLACPY, ZLASCL, ZSTEIN, ZSTEQR,
+ $ ZSWAP, ZHETRD_HB2ST
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.KD+1 ) THEN
+ INFO = -7
+ ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -11
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -12
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -13
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+ $ INFO = -18
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+ LWMIN = LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ ENDIF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -20
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHBEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ M = 1
+ IF( LOWER ) THEN
+ CTMP1 = AB( 1, 1 )
+ ELSE
+ CTMP1 = AB( KD+1, 1 )
+ END IF
+ TMP1 = DBLE( CTMP1 )
+ IF( VALEIG ) THEN
+ IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+ $ M = 0
+ END IF
+ IF( M.EQ.1 ) THEN
+ W( 1 ) = DBLE( CTMP1 )
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ ELSE
+ VLL = ZERO
+ VUU = ZERO
+ END IF
+ ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ ELSE
+ CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDRWK = INDE + N
+*
+ INDHOUS = 1
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL ZHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB,
+ $ RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ),
+ $ LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal
+* to zero, then call DSTERF or ZSTEQR. If this fails for some
+* eigenvalue, then try DSTEBZ.
+*
+ TEST = .FALSE.
+ IF (INDEIG) THEN
+ IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+ CALL DCOPY( N, RWORK( INDD ), 1, W, 1 )
+ INDEE = INDRWK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL DSTERF( N, W, RWORK( INDEE ), INFO )
+ ELSE
+ CALL ZLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+ CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 10 I = 1, N
+ IFAIL( I ) = 0
+ 10 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWK = INDISP + N
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWK ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by ZSTEIN.
+*
+ DO 20 J = 1, M
+ CALL ZCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+ CALL ZGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO,
+ $ Z( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of ZHBEVX_2STAGE
+*
+ END
diff --git a/SRC/zheev_2stage.f b/SRC/zheev_2stage.f
new file mode 100644
index 00000000..5aca4da2
--- /dev/null
+++ b/SRC/zheev_2stage.f
@@ -0,0 +1,355 @@
+*> \brief <b> ZHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 A( LDA, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, the algorithm failed to converge; i
+*> off-diagonal elements of an intermediate tridiagonal
+*> form did not converge to zero.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LWORK, N
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 A( LDA, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+ COMPLEX*16 CONE
+ PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+ $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHE
+ EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR,
+ $ ZUNGTR, ZHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -8
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHEEV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ WORK( 1 ) = 1
+ IF( WANTZ )
+ $ A( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDE = 1
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, first call
+* ZUNGTR to generate the unitary matrix, then call ZSTEQR.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL ZUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+ $ LLWORK, IINFO )
+ INDWRK = INDE + N
+ CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA,
+ $ RWORK( INDWRK ), INFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* Set WORK(1) to optimal complex workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of ZHEEV_2STAGE
+*
+ END
diff --git a/SRC/zheevd_2stage.f b/SRC/zheevd_2stage.f
new file mode 100644
index 00000000..79a0e886
--- /dev/null
+++ b/SRC/zheevd_2stage.f
@@ -0,0 +1,451 @@
+*> \brief <b> ZHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+* RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+* INTEGER IWORK( * )
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 A( LDA, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> orthonormal eigenvectors of the matrix A.
+*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*> or the upper triangle (if UPLO='U') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If N <= 1, LWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N+1
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N+1
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array,
+*> dimension (LRWORK)
+*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The dimension of the array RWORK.
+*> If N <= 1, LRWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*> If JOBZ = 'V' and N > 1, LRWORK must be at least
+*> 1 + 5*N + 2*N**2.
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK.
+*> If N <= 1, LIWORK must be at least 1.
+*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+*> to converge; i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> if INFO = i and JOBZ = 'V', then the algorithm failed
+*> to compute an eigenvalue while working on the submatrix
+*> lying in rows and columns INFO/(N+1) through
+*> mod(INFO,N+1).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> Modified description of INFO. Sven, 16 Feb 05.
+*
+*> \par Contributors:
+* ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+ $ RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N
+* ..
+* .. Array Arguments ..
+ INTEGER IWORK( * )
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 A( LDA, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+ COMPLEX*16 CONE
+ PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LOWER, LQUERY, WANTZ
+ INTEGER IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2,
+ $ INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK,
+ $ LLWRK2, LRWMIN, LWMIN,
+ $ LHTRD, LWTRD, KD, IB, INDHOUS
+
+
+ DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+ $ SMLNUM
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHE
+ EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL DSCAL, DSTERF, XERBLA, ZLACPY, ZLASCL,
+ $ ZSTEDC, ZUNMTR, ZHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, MAX, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ LOWER = LSAME( UPLO, 'L' )
+ LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ LRWMIN = 1
+ LIWMIN = 1
+ ELSE
+ KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ IF( WANTZ ) THEN
+ LWMIN = 2*N + N*N
+ LRWMIN = 1 + 5*N + 2*N**2
+ LIWMIN = 3 + 5*N
+ ELSE
+ LWMIN = N + 1 + LHTRD + LWTRD
+ LRWMIN = N
+ LIWMIN = 1
+ END IF
+ END IF
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -8
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHEEVD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+ IF( N.EQ.1 ) THEN
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ IF( WANTZ )
+ $ A( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = SQRT( BIGNUM )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ ISCALE = 0
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 )
+ $ CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDE = 1
+ INDRWK = INDE + N
+ LLRWK = LRWORK - INDRWK + 1
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+ INDWK2 = INDWRK + N*N
+ LLWRK2 = LWORK - INDWK2 + 1
+*
+ CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+ $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWRK ), LLWORK, IINFO )
+*
+* For eigenvalues only, call DSTERF. For eigenvectors, first call
+* ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+* tridiagonal matrix, then call ZUNMTR to multiply it to the
+* Householder transformations represented as Householder vectors in
+* A.
+*
+ IF( .NOT.WANTZ ) THEN
+ CALL DSTERF( N, W, RWORK( INDE ), INFO )
+ ELSE
+ CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N,
+ $ WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK,
+ $ IWORK, LIWORK, INFO )
+ CALL ZUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+ $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+ CALL ZLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = N
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of ZHEEVD_2STAGE
+*
+ END
diff --git a/SRC/zheevr_2stage.f b/SRC/zheevr_2stage.f
new file mode 100644
index 00000000..bfd43056
--- /dev/null
+++ b/SRC/zheevr_2stage.f
@@ -0,0 +1,779 @@
+*> \brief <b> ZHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+* WORK, LWORK, RWORK, LRWORK, IWORK,
+* LIWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+* $ M, N
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER ISUPPZ( * ), IWORK( * )
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> ZHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to ZHETRD. Then, whenever possible, ZHEEVR_2STAGE calls ZSTEMR to compute
+*> eigenspectrum using Relatively Robust Representations. ZSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*> (a) Compute T - sigma I = L D L^T, so that L and D
+*> define all the wanted eigenvalues to high relative accuracy.
+*> This means that small relative changes in the entries of D and L
+*> cause only small relative changes in the eigenvalues and
+*> eigenvectors. The standard (unfactored) representation of the
+*> tridiagonal matrix T does not have this property in general.
+*> (b) Compute the eigenvalues to suitable accuracy.
+*> If the eigenvectors are desired, the algorithm attains full
+*> accuracy of the computed eigenvalues only right before
+*> the corresponding vectors have to be computed, see steps c) and d).
+*> (c) For each cluster of close eigenvalues, select a new
+*> shift close to the cluster, find a new factorization, and refine
+*> the shifted eigenvalues to suitable accuracy.
+*> (d) For each eigenvalue with a large enough relative separation compute
+*> the corresponding eigenvector by forming a rank revealing twisted
+*> factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*> 2004. Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*> tridiagonal eigenvalue/eigenvector problem",
+*> Computer Science Division Technical Report No. UCB/CSD-97-971,
+*> UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : ZHEEVR_2STAGE calls ZSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> ZHEEVR_2STAGE calls DSTEBZ and ZSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of ZSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and
+*> ZSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*>
+*> If high relative accuracy is important, set ABSTOL to
+*> DLAMCH( 'Safe minimum' ). Doing so will guarantee that
+*> eigenvalues are computed to high relative accuracy when
+*> possible in future releases. The current code does not
+*> make any guarantees about high relative accuracy, but
+*> furutre releases will. See J. Barlow and J. Demmel,
+*> "Computing Accurate Eigensystems of Scaled Diagonally
+*> Dominant Matrices", LAPACK Working Note #7, for a discussion
+*> of which matrices define their eigenvalues to high relative
+*> accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> The first M elements contain the selected eigenvalues in
+*> ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*> The support of the eigenvectors in Z, i.e., the indices
+*> indicating the nonzero elements in Z. The i-th eigenvector
+*> is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal
+*> matrix). The support of the eigenvectors of A is typically
+*> 1:N because of the unitary transformations applied by ZUNMTR.
+*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK.
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 26*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal sizes of the WORK, RWORK and
+*> IWORK arrays, returns these values as the first entries of
+*> the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+*> On exit, if INFO = 0, RWORK(1) returns the optimal
+*> (and minimal) LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER
+*> The length of the array RWORK. LRWORK >= max(1,24*N).
+*>
+*> If LRWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*> On exit, if INFO = 0, IWORK(1) returns the optimal
+*> (and minimal) LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*> LIWORK is INTEGER
+*> The dimension of the array IWORK. LIWORK >= max(1,10*N).
+*>
+*> If LIWORK = -1, then a workspace query is assumed; the
+*> routine only calculates the optimal sizes of the WORK, RWORK
+*> and IWORK arrays, returns these values as the first entries
+*> of the WORK, RWORK and IWORK arrays, and no error message
+*> related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: Internal error
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Contributors:
+* ==================
+*>
+*> Inderjit Dhillon, IBM Almaden, USA \n
+*> Osni Marques, LBNL/NERSC, USA \n
+*> Ken Stanley, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*> Jason Riedy, Computer Science Division, University of
+*> California at Berkeley, USA \n
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+ $ WORK, LWORK, RWORK, LRWORK, IWORK,
+ $ LIWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+ $ M, N
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER ISUPPZ( * ), IWORK( * )
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE, TWO
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ, TRYRAC
+ CHARACTER ORDER
+ INTEGER I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP,
+ $ INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK,
+ $ INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ,
+ $ LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN,
+ $ LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANSY
+ EXTERNAL LSAME, ILAENV, DLAMCH, ZLANSY
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL,
+ $ ZHETRD_2STAGE, ZSTEMR, ZSTEIN, ZSWAP, ZUNMTR
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ IEEEOK = ILAENV( 10, 'ZHEEVR', 'N', 1, 2, 3, 4 )
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+*
+ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
+ $ ( LIWORK.EQ.-1 ) )
+*
+ KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ LRWMIN = MAX( 1, 24*N )
+ LIWMIN = MAX( 1, 10*N )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -18
+ ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -20
+ ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -22
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHEEVR_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ WORK( 1 ) = 2
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ ELSE
+ IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) )
+ $ THEN
+ M = 1
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ END IF
+ END IF
+ IF( WANTZ ) THEN
+ Z( 1, 1 ) = ONE
+ ISUPPZ( 1 ) = 1
+ ISUPPZ( 2 ) = 1
+ END IF
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF (VALEIG) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = ZLANSY( 'M', UPLO, N, A, LDA, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+
+* Initialize indices into workspaces. Note: The IWORK indices are
+* used only if DSTERF or ZSTEMR fail.
+
+* WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the
+* elementary reflectors used in ZHETRD.
+ INDTAU = 1
+* INDWK is the starting offset of the remaining complex workspace,
+* and LLWORK is the remaining complex workspace size.
+ INDHOUS = INDTAU + N
+ INDWK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWK + 1
+
+* RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal
+* entries.
+ INDRD = 1
+* RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the
+* tridiagonal matrix from ZHETRD.
+ INDRE = INDRD + N
+* RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over
+* -written by ZSTEMR (the DSTERF path copies the diagonal to W).
+ INDRDD = INDRE + N
+* RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over
+* -written while computing the eigenvalues in DSTERF and ZSTEMR.
+ INDREE = INDRDD + N
+* INDRWK is the starting offset of the left-over real workspace, and
+* LLRWORK is the remaining workspace size.
+ INDRWK = INDREE + N
+ LLRWORK = LRWORK - INDRWK + 1
+
+* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and
+* stores the block indices of each of the M<=N eigenvalues.
+ INDIBL = 1
+* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and
+* stores the starting and finishing indices of each block.
+ INDISP = INDIBL + N
+* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+* that corresponding to eigenvectors that fail to converge in
+* ZSTEIN. This information is discarded; if any fail, the driver
+* returns INFO > 0.
+ INDIFL = INDISP + N
+* INDIWO is the offset of the remaining integer workspace.
+ INDIWO = INDIFL + N
+
+*
+* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
+ $ RWORK( INDRE ), WORK( INDTAU ),
+ $ WORK( INDHOUS ), LHTRD,
+ $ WORK( INDWK ), LLWORK, IINFO )
+*
+* If all eigenvalues are desired
+* then call DSTERF or ZSTEMR and ZUNMTR.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N, RWORK( INDRD ), 1, W, 1 )
+ CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+ CALL DSTERF( N, W, RWORK( INDREE ), INFO )
+ ELSE
+ CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+ CALL DCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
+*
+ IF (ABSTOL .LE. TWO*N*EPS) THEN
+ TRYRAC = .TRUE.
+ ELSE
+ TRYRAC = .FALSE.
+ END IF
+ CALL ZSTEMR( JOBZ, 'A', N, RWORK( INDRDD ),
+ $ RWORK( INDREE ), VL, VU, IL, IU, M, W,
+ $ Z, LDZ, N, ISUPPZ, TRYRAC,
+ $ RWORK( INDRWK ), LLRWORK,
+ $ IWORK, LIWORK, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by ZSTEMR.
+*
+ IF( WANTZ .AND. INFO.EQ.0 ) THEN
+ INDWKN = INDWK
+ LLWRKN = LWORK - INDWKN + 1
+ CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA,
+ $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+ $ LLWRKN, IINFO )
+ END IF
+ END IF
+*
+*
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 30
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+* Also call DSTEBZ and ZSTEIN if ZSTEMR fails.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWO ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL ZSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+ $ INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by ZSTEIN.
+*
+ INDWKN = INDWK
+ LLWRKN = LWORK - INDWKN + 1
+ CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 30 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 50 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 40 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 40 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ END IF
+ 50 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal workspace size.
+*
+ WORK( 1 ) = LWMIN
+ RWORK( 1 ) = LRWMIN
+ IWORK( 1 ) = LIWMIN
+*
+ RETURN
+*
+* End of ZHEEVR_2STAGE
+*
+ END
diff --git a/SRC/zheevx_2stage.f b/SRC/zheevx_2stage.f
new file mode 100644
index 00000000..e33d55e0
--- /dev/null
+++ b/SRC/zheevx_2stage.f
@@ -0,0 +1,618 @@
+*> \brief <b> ZHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+* IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+* LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, RANGE, UPLO
+* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+* DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+* INTEGER IFAIL( * ), IWORK( * )
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*> RANGE is CHARACTER*1
+*> = 'A': all eigenvalues will be found.
+*> = 'V': all eigenvalues in the half-open interval (VL,VU]
+*> will be found.
+*> = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*> On exit, the lower triangle (if UPLO='L') or the upper
+*> triangle (if UPLO='U') of A, including the diagonal, is
+*> destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*> VL is DOUBLE PRECISION
+*> If RANGE='V', the lower bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*> VU is DOUBLE PRECISION
+*> If RANGE='V', the upper bound of the interval to
+*> be searched for eigenvalues. VL < VU.
+*> Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*> IL is INTEGER
+*> If RANGE='I', the index of the
+*> smallest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*> IU is INTEGER
+*> If RANGE='I', the index of the
+*> largest eigenvalue to be returned.
+*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*> Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*> ABSTOL is DOUBLE PRECISION
+*> The absolute error tolerance for the eigenvalues.
+*> An approximate eigenvalue is accepted as converged
+*> when it is determined to lie in an interval [a,b]
+*> of width less than or equal to
+*>
+*> ABSTOL + EPS * max( |a|,|b| ) ,
+*>
+*> where EPS is the machine precision. If ABSTOL is less than
+*> or equal to zero, then EPS*|T| will be used in its place,
+*> where |T| is the 1-norm of the tridiagonal matrix obtained
+*> by reducing A to tridiagonal form.
+*>
+*> Eigenvalues will be computed most accurately when ABSTOL is
+*> set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*> If this routine returns with INFO>0, indicating that some
+*> eigenvectors did not converge, try setting ABSTOL to
+*> 2*DLAMCH('S').
+*>
+*> See "Computing Small Singular Values of Bidiagonal Matrices
+*> with Guaranteed High Relative Accuracy," by Demmel and
+*> Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*> M is INTEGER
+*> The total number of eigenvalues found. 0 <= M <= N.
+*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> On normal exit, the first M elements contain the selected
+*> eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*> contain the orthonormal eigenvectors of the matrix A
+*> corresponding to the selected eigenvalues, with the i-th
+*> column of Z holding the eigenvector associated with W(i).
+*> If an eigenvector fails to converge, then that column of Z
+*> contains the latest approximation to the eigenvector, and the
+*> index of the eigenvector is returned in IFAIL.
+*> If JOBZ = 'N', then Z is not referenced.
+*> Note: the user must ensure that at least max(1,M) columns are
+*> supplied in the array Z; if RANGE = 'V', the exact value of M
+*> is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is INTEGER
+*> The leading dimension of the array Z. LDZ >= 1, and if
+*> JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, 8*N, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*> IFAIL is INTEGER array, dimension (N)
+*> If JOBZ = 'V', then if INFO = 0, the first M elements of
+*> IFAIL are zero. If INFO > 0, then IFAIL contains the
+*> indices of the eigenvectors that failed to converge.
+*> If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, then i eigenvectors failed to converge.
+*> Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+ $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+ $ LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.1) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, RANGE, UPLO
+ INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N
+ DOUBLE PRECISION ABSTOL, VL, VU
+* ..
+* .. Array Arguments ..
+ INTEGER IFAIL( * ), IWORK( * )
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+ COMPLEX*16 CONE
+ PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+ $ WANTZ
+ CHARACTER ORDER
+ INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+ $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
+ $ ITMP1, J, JJ, LLWORK,
+ $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+ DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+ $ SIGMA, SMLNUM, TMP1, VLL, VUU
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ DOUBLE PRECISION DLAMCH, ZLANHE
+ EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL,
+ $ ZLACPY, ZSTEIN, ZSTEQR, ZSWAP, ZUNGTR, ZUNMTR,
+ $ ZHETRD_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DBLE, MAX, MIN, SQRT
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ LOWER = LSAME( UPLO, 'L' )
+ WANTZ = LSAME( JOBZ, 'V' )
+ ALLEIG = LSAME( RANGE, 'A' )
+ VALEIG = LSAME( RANGE, 'V' )
+ INDEIG = LSAME( RANGE, 'I' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE
+ IF( VALEIG ) THEN
+ IF( N.GT.0 .AND. VU.LE.VL )
+ $ INFO = -8
+ ELSE IF( INDEIG ) THEN
+ IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+ INFO = -9
+ ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+ INFO = -10
+ END IF
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+ INFO = -15
+ END IF
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ IF( N.LE.1 ) THEN
+ LWMIN = 1
+ WORK( 1 ) = LWMIN
+ ELSE
+ KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+ $ INFO = -17
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHEEVX_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ M = 0
+ IF( N.EQ.0 ) THEN
+ RETURN
+ END IF
+*
+ IF( N.EQ.1 ) THEN
+ IF( ALLEIG .OR. INDEIG ) THEN
+ M = 1
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ ELSE IF( VALEIG ) THEN
+ IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) )
+ $ THEN
+ M = 1
+ W( 1 ) = DBLE( A( 1, 1 ) )
+ END IF
+ END IF
+ IF( WANTZ )
+ $ Z( 1, 1 ) = CONE
+ RETURN
+ END IF
+*
+* Get machine constants.
+*
+ SAFMIN = DLAMCH( 'Safe minimum' )
+ EPS = DLAMCH( 'Precision' )
+ SMLNUM = SAFMIN / EPS
+ BIGNUM = ONE / SMLNUM
+ RMIN = SQRT( SMLNUM )
+ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+* Scale matrix to allowable range, if necessary.
+*
+ ISCALE = 0
+ ABSTLL = ABSTOL
+ IF( VALEIG ) THEN
+ VLL = VL
+ VUU = VU
+ END IF
+ ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+ IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+ ISCALE = 1
+ SIGMA = RMIN / ANRM
+ ELSE IF( ANRM.GT.RMAX ) THEN
+ ISCALE = 1
+ SIGMA = RMAX / ANRM
+ END IF
+ IF( ISCALE.EQ.1 ) THEN
+ IF( LOWER ) THEN
+ DO 10 J = 1, N
+ CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+ 10 CONTINUE
+ ELSE
+ DO 20 J = 1, N
+ CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 )
+ 20 CONTINUE
+ END IF
+ IF( ABSTOL.GT.0 )
+ $ ABSTLL = ABSTOL*SIGMA
+ IF( VALEIG ) THEN
+ VLL = VL*SIGMA
+ VUU = VU*SIGMA
+ END IF
+ END IF
+*
+* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+ INDD = 1
+ INDE = INDD + N
+ INDRWK = INDE + N
+ INDTAU = 1
+ INDHOUS = INDTAU + N
+ INDWRK = INDHOUS + LHTRD
+ LLWORK = LWORK - INDWRK + 1
+*
+ CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
+ $ RWORK( INDE ), WORK( INDTAU ),
+ $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
+ $ LLWORK, IINFO )
+*
+* If all eigenvalues are desired and ABSTOL is less than or equal to
+* zero, then call DSTERF or ZUNGTR and ZSTEQR. If this fails for
+* some eigenvalue, then try DSTEBZ.
+*
+ TEST = .FALSE.
+ IF( INDEIG ) THEN
+ IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+ TEST = .TRUE.
+ END IF
+ END IF
+ IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+ CALL DCOPY( N, RWORK( INDD ), 1, W, 1 )
+ INDEE = INDRWK + 2*N
+ IF( .NOT.WANTZ ) THEN
+ CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL DSTERF( N, W, RWORK( INDEE ), INFO )
+ ELSE
+ CALL ZLACPY( 'A', N, N, A, LDA, Z, LDZ )
+ CALL ZUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+ $ WORK( INDWRK ), LLWORK, IINFO )
+ CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+ CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+ $ RWORK( INDRWK ), INFO )
+ IF( INFO.EQ.0 ) THEN
+ DO 30 I = 1, N
+ IFAIL( I ) = 0
+ 30 CONTINUE
+ END IF
+ END IF
+ IF( INFO.EQ.0 ) THEN
+ M = N
+ GO TO 40
+ END IF
+ INFO = 0
+ END IF
+*
+* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+*
+ IF( WANTZ ) THEN
+ ORDER = 'B'
+ ELSE
+ ORDER = 'E'
+ END IF
+ INDIBL = 1
+ INDISP = INDIBL + N
+ INDIWK = INDISP + N
+ CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+ $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+ $ IWORK( INDIWK ), INFO )
+*
+ IF( WANTZ ) THEN
+ CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+ $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+ $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+* Apply unitary matrix used in reduction to tridiagonal
+* form to eigenvectors returned by ZSTEIN.
+*
+ CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+ $ LDZ, WORK( INDWRK ), LLWORK, IINFO )
+ END IF
+*
+* If matrix was scaled, then rescale eigenvalues appropriately.
+*
+ 40 CONTINUE
+ IF( ISCALE.EQ.1 ) THEN
+ IF( INFO.EQ.0 ) THEN
+ IMAX = M
+ ELSE
+ IMAX = INFO - 1
+ END IF
+ CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+ END IF
+*
+* If eigenvalues are not in order, then sort them, along with
+* eigenvectors.
+*
+ IF( WANTZ ) THEN
+ DO 60 J = 1, M - 1
+ I = 0
+ TMP1 = W( J )
+ DO 50 JJ = J + 1, M
+ IF( W( JJ ).LT.TMP1 ) THEN
+ I = JJ
+ TMP1 = W( JJ )
+ END IF
+ 50 CONTINUE
+*
+ IF( I.NE.0 ) THEN
+ ITMP1 = IWORK( INDIBL+I-1 )
+ W( I ) = W( J )
+ IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+ W( J ) = TMP1
+ IWORK( INDIBL+J-1 ) = ITMP1
+ CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+ IF( INFO.NE.0 ) THEN
+ ITMP1 = IFAIL( I )
+ IFAIL( I ) = IFAIL( J )
+ IFAIL( J ) = ITMP1
+ END IF
+ END IF
+ 60 CONTINUE
+ END IF
+*
+* Set WORK(1) to optimal complex workspace size.
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of ZHEEVX_2STAGE
+*
+ END
diff --git a/SRC/zhegv_2stage.f b/SRC/zhegv_2stage.f
new file mode 100644
index 00000000..5079d240
--- /dev/null
+++ b/SRC/zhegv_2stage.f
@@ -0,0 +1,379 @@
+*> \brief \b ZHEGV_2STAGE
+*
+* @precisions fortran z -> c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+* WORK, LWORK, RWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER JOBZ, UPLO
+* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION RWORK( * ), W( * )
+* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a complex generalized Hermitian-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be Hermitian and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+* sizes N>2000.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*> ITYPE is INTEGER
+*> Specifies the problem type to be solved:
+*> = 1: A*x = (lambda)*B*x
+*> = 2: A*B*x = (lambda)*x
+*> = 3: B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*> JOBZ is CHARACTER*1
+*> = 'N': Compute eigenvalues only;
+*> = 'V': Compute eigenvalues and eigenvectors.
+*> Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangles of A and B are stored;
+*> = 'L': Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrices A and B. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA, N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the
+*> leading N-by-N upper triangular part of A contains the
+*> upper triangular part of the matrix A. If UPLO = 'L',
+*> the leading N-by-N lower triangular part of A contains
+*> the lower triangular part of the matrix A.
+*>
+*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*> matrix Z of eigenvectors. The eigenvectors are normalized
+*> as follows:
+*> if ITYPE = 1 or 2, Z**H*B*Z = I;
+*> if ITYPE = 3, Z**H*inv(B)*Z = I.
+*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*> or the lower triangle (if UPLO='L') of A, including the
+*> diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is COMPLEX*16 array, dimension (LDB, N)
+*> On entry, the Hermitian positive definite matrix B.
+*> If UPLO = 'U', the leading N-by-N upper triangular part of B
+*> contains the upper triangular part of the matrix B.
+*> If UPLO = 'L', the leading N-by-N lower triangular part of B
+*> contains the lower triangular part of the matrix B.
+*>
+*> On exit, if INFO <= N, the part of B containing the matrix is
+*> overwritten by the triangular factor U or L from the Cholesky
+*> factorization B = U**H*U or B = L*L**H.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B. LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*> W is DOUBLE PRECISION array, dimension (N)
+*> If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The length of the array WORK. LWORK >= 1, when N <= 1;
+*> otherwise
+*> If JOBZ = 'N' and N > 1, LWORK must be queried.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N + N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N + N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: ZPOTRF or ZHEEV returned an error code:
+*> <= N: if INFO = i, ZHEEV failed to converge;
+*> i off-diagonal elements of an intermediate
+*> tridiagonal form did not converge to zero;
+*> > N: if INFO = N + i, for 1 <= i <= N, then the leading
+*> minor of order i of B is not positive definite.
+*> The factorization of B could not be completed and
+*> no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> All details about the 2stage techniques are available in:
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK driver routine (version 3.6.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER JOBZ, UPLO
+ INTEGER INFO, ITYPE, LDA, LDB, LWORK, N
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION RWORK( * ), W( * )
+ COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX*16 ONE
+ PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTZ
+ CHARACTER TRANS
+ INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, ZHEGST, ZPOTRF, ZTRMM, ZTRSM,
+ $ ZHEEV_2STAGE
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters.
+*
+ WANTZ = LSAME( JOBZ, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+*
+ INFO = 0
+ IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+ INFO = -1
+ ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -6
+ ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+ INFO = -8
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+ LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+ LWMIN = N + LHTRD + LWTRD
+ WORK( 1 ) = LWMIN
+*
+ IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ END IF
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHEGV_2STAGE ', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
+*
+* Form a Cholesky factorization of B.
+*
+ CALL ZPOTRF( UPLO, N, B, LDB, INFO )
+ IF( INFO.NE.0 ) THEN
+ INFO = N + INFO
+ RETURN
+ END IF
+*
+* Transform problem to standard eigenvalue problem and solve.
+*
+ CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+ CALL ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W,
+ $ WORK, LWORK, RWORK, INFO )
+*
+ IF( WANTZ ) THEN
+*
+* Backtransform eigenvectors to the original problem.
+*
+ NEIG = N
+ IF( INFO.GT.0 )
+ $ NEIG = INFO - 1
+ IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+* backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y
+*
+ IF( UPPER ) THEN
+ TRANS = 'N'
+ ELSE
+ TRANS = 'C'
+ END IF
+*
+ CALL ZTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+*
+ ELSE IF( ITYPE.EQ.3 ) THEN
+*
+* For B*A*x=(lambda)*x;
+* backtransform eigenvectors: x = L*y or U**H *y
+*
+ IF( UPPER ) THEN
+ TRANS = 'C'
+ ELSE
+ TRANS = 'N'
+ END IF
+*
+ CALL ZTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+ $ B, LDB, A, LDA )
+ END IF
+ END IF
+*
+ WORK( 1 ) = LWMIN
+*
+ RETURN
+*
+* End of ZHEGV_2STAGE
+*
+ END
diff --git a/SRC/zhetrd_2stage.f b/SRC/zhetrd_2stage.f
new file mode 100644
index 00000000..62fd7539
--- /dev/null
+++ b/SRC/zhetrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b ZHETRD_2STAGE
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+* HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER VECT, UPLO
+* INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION D( * ), E( * )
+* COMPLEX*16 A( LDA, * ), TAU( * ),
+* HOUS2( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q1**H Q2**H* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> in particular for the second stage (Band to
+*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate Q1 Q2 or to apply Q1 Q2,
+*> then LHOUS2 is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA,N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the band superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> internal band-diagonal matrix AB, and the elements above
+*> the KD superdiagonal, with the array TAU, represent the unitary
+*> matrix Q1 as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and band subdiagonal of A are over-
+*> written by the corresponding elements of the internal band-diagonal
+*> matrix AB, and the elements below the KD subdiagonal, with
+*> the array TAU, represent the unitary matrix Q1 as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is DOUBLE PRECISION array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is DOUBLE PRECISION array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is COMPLEX*16 array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors of
+*> the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*> HOUS2 is COMPLEX*16 array, dimension LHOUS2, that
+*> store the Householder representation of the stage2
+*> band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*> LHOUS2 is INTEGER
+*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS2 array, returns
+*> this value as the first entry of the HOUS2 array, and no error
+*> message related to LHOUS2 is issued by XERBLA.
+*> LHOUS2 = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS2=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = max(stage1,stage2) + (KD+1)*N
+*> = N*KD + N*max(KD+1,FACTOPTNB)
+*> + max(2*KD*KD, KD*NTHREADS)
+*> + (KD+1)*N
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
+ $ HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER VECT, UPLO
+ INTEGER N, LDA, LWORK, LHOUS2, INFO
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION D( * ), E( * )
+ COMPLEX*16 A( LDA, * ), TAU( * ),
+ $ HOUS2( * ), WORK( * )
+* ..
+*
+* =====================================================================
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER, WANTQ
+ INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, ZHETRD_HE2HB, ZHETRD_HB2ST
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ INFO = 0
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ KD = ILAENV( 17, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
+ IB = ILAENV( 18, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+* $ LHMIN, LWMIN
+*
+ IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -2
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -12
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHETRD_2STAGE', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDAB = KD+1
+ LWRK = LWORK-LDAB*N
+ ABPOS = 1
+ WPOS = ABPOS + LDAB*N
+ CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
+ $ TAU, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
+ RETURN
+ END IF
+ CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+ $ WORK( ABPOS ), LDAB, D, E,
+ $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
+ RETURN
+ END IF
+*
+*
+ HOUS2( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of ZHETRD_2STAGE
+*
+ END
diff --git a/SRC/zhetrd_hb2st.F b/SRC/zhetrd_hb2st.F
new file mode 100644
index 00000000..5d62e30d
--- /dev/null
+++ b/SRC/zhetrd_hb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b ZHBTRD
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+* D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+* #define PRECISION_COMPLEX
+*
+* #if defined(_OPENMP)
+* use omp_lib
+* #endif
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER STAGE1, UPLO, VECT
+* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION D( * ), E( * )
+* COMPLEX*16 AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHBTRD reduces a complex Hermitian band matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q**H * A * Q = T.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*> STAGE is CHARACTER*1
+*> = 'N': "No": to mention that the stage 1 of the reduction
+*> from dense to band using the zhetrd_he2hb routine
+*> was not called before this routine to reproduce AB.
+*> In other term this routine is called as standalone.
+*> = 'Y': "Yes": to mention that the stage 1 of the
+*> reduction from dense to band using the zhetrd_he2hb
+*> routine has been called to produce AB (e.g., AB is
+*> the output of zhetrd_he2hb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*> VECT is CHARACTER*1
+*> = 'N': No need for the Housholder representation,
+*> and thus LHOUS is of size max(1, 4*N);
+*> = 'V': the Householder representation is needed to
+*> either generate or to apply Q later on,
+*> then LHOUS is to be queried and computed.
+*> (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the matrix A if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*> AB is COMPLEX*16 array, dimension (LDAB,N)
+*> On entry, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> On exit, the diagonal elements of AB are overwritten by the
+*> diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*> elements on the first superdiagonal (if UPLO = 'U') or the
+*> first subdiagonal (if UPLO = 'L') are overwritten by the
+*> off-diagonal elements of T; the rest of AB is overwritten by
+*> values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is DOUBLE PRECISION array, dimension (N)
+*> The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*> E is DOUBLE PRECISION array, dimension (N-1)
+*> The off-diagonal elements of the tridiagonal matrix T:
+*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*> HOUS is COMPLEX*16 array, dimension LHOUS, that
+*> store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*> LHOUS is INTEGER
+*> The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a query is assumed; the routine
+*> only calculates the optimal size of the HOUS array, returns
+*> this value as the first entry of the HOUS array, and no error
+*> message related to LHOUS is issued by XERBLA.
+*> LHOUS = MAX(1, dimension) where
+*> dimension = 4*N if VECT='N'
+*> not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK. LWORK = MAX(1, dimension)
+*> If LWORK = -1, or LHOUS=-1,
+*> then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK = MAX(1, dimension) where
+*> dimension = (2KD+1)*N + KD*NTHREADS
+*> where KD is the blocking size of the reduction,
+*> FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice
+*> NTHREADS is the number of threads used when
+*> openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHERcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
+ $ D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_COMPLEX
+*
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER STAGE1, UPLO, VECT
+ INTEGER N, KD, LDAB, LHOUS, LWORK, INFO
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION D( * ), E( * )
+ COMPLEX*16 AB( LDAB, * ), HOUS( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION RZERO
+ COMPLEX*16 ZERO, ONE
+ PARAMETER ( RZERO = 0.0D+0,
+ $ ZERO = ( 0.0D+0, 0.0D+0 ),
+ $ ONE = ( 1.0D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, WANTQ, UPPER, AFTERS1
+ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
+ $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+ $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+ $ NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+ $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
+ $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+ $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+ DOUBLE PRECISION ABSTMP
+ COMPLEX*16 TMP
+#endif
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZHB2ST_KERNELS, ZLACPY, ZLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX, CEILING, DBLE, REAL
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required.
+* Test the input parameters
+*
+ DEBUG = 0
+ INFO = 0
+ AFTERS1 = LSAME( STAGE1, 'Y' )
+ WANTQ = LSAME( VECT, 'V' )
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+* Determine the block size, the workspace size and the hous size.
+*
+ IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+ LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+ LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+*
+ IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+ INFO = -1
+ ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+ INFO = -2
+ ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -3
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -4
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.(KD+1) ) THEN
+ INFO = -7
+ ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -11
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -13
+ END IF
+*
+ IF( INFO.EQ.0 ) THEN
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 ) THEN
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine pointer position
+*
+ LDV = KD + IB
+ SIZETAU = 2 * N
+ SIZEV = 2 * N
+ INDTAU = 1
+ INDV = INDTAU + SIZETAU
+ LDA = 2 * KD + 1
+ SIZEA = LDA * N
+ INDA = 1
+ INDW = INDA + SIZEA
+ NTHREADS = 1
+ TID = 0
+*
+ IF( UPPER ) THEN
+ APOS = INDA + KD
+ AWPOS = INDA
+ DPOS = APOS + KD
+ OFDPOS = DPOS - 1
+ ABDPOS = KD + 1
+ ABOFDPOS = KD
+ ELSE
+ APOS = INDA
+ AWPOS = INDA + KD + 1
+ DPOS = APOS
+ OFDPOS = DPOS + 1
+ ABDPOS = 1
+ ABOFDPOS = 2
+
+ ENDIF
+*
+* Case KD=0:
+* The matrix is diagonal. We just copy it (convert to "real" for
+* complex because D is double and the imaginary part should be 0)
+* and store it in D. A sequential code here is better or
+* in a parallel environment it might need two cores for D and E
+*
+ IF( KD.EQ.0 ) THEN
+ DO 30 I = 1, N
+ D( I ) = DBLE( AB( ABDPOS, I ) )
+ 30 CONTINUE
+ DO 40 I = 1, N-1
+ E( I ) = RZERO
+ 40 CONTINUE
+ GOTO 200
+ END IF
+*
+* Case KD=1:
+* The matrix is already Tridiagonal. We have to make diagonal
+* and offdiagonal elements real, and store them in D and E.
+* For that, for real precision just copy the diag and offdiag
+* to D and E while for the COMPLEX case the bulge chasing is
+* performed to convert the hermetian tridiagonal to symmetric
+* tridiagonal. A simpler coversion formula might be used, but then
+* updating the Q matrix will be required and based if Q is generated
+* or not this might complicate the story.
+*
+C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+ IF( KD.EQ.1 ) THEN
+ DO 50 I = 1, N
+ D( I ) = DBLE( AB( ABDPOS, I ) )
+ 50 CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+* make off-diagonal elements real and copy them to E
+*
+ IF( UPPER ) THEN
+ DO 60 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I+1 )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I+1 ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C IF( WANTZ ) THEN
+C CALL ZSCAL( N, DCONJG( TMP ), Q( 1, I+1 ), 1 )
+C END IF
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N - 1
+ TMP = AB( ABOFDPOS, I )
+ ABSTMP = ABS( TMP )
+ AB( ABOFDPOS, I ) = ABSTMP
+ E( I ) = ABSTMP
+ IF( ABSTMP.NE.RZERO ) THEN
+ TMP = TMP / ABSTMP
+ ELSE
+ TMP = ONE
+ END IF
+ IF( I.LT.N-1 )
+ $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C IF( WANTQ ) THEN
+C CALL ZSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C END IF
+ 70 CONTINUE
+ ENDIF
+#else
+ IF( UPPER ) THEN
+ DO 60 I = 1, N-1
+ E( I ) = DBLE( AB( ABOFDPOS, I+1 ) )
+ 60 CONTINUE
+ ELSE
+ DO 70 I = 1, N-1
+ E( I ) = DBLE( AB( ABOFDPOS, I ) )
+ 70 CONTINUE
+ ENDIF
+#endif
+ GOTO 200
+ END IF
+*
+* Main code start here.
+* Reduce the hermitian band of A to a tridiagonal matrix.
+*
+ THGRSIZ = N
+ GRSIZ = 1
+ SHIFT = 3
+ NBTILES = CEILING( REAL(N)/REAL(KD) )
+ STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+ THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*
+ CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+ CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+* openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
+!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+* main bulge chasing loop
+*
+ DO 100 THGRID = 1, THGRNB
+ STT = (THGRID-1)*THGRSIZ+1
+ THED = MIN( (STT + THGRSIZ -1), (N-1))
+ DO 110 I = STT, N-1
+ ED = MIN( I, THED )
+ IF( STT.GT.ED ) GOTO 100
+ DO 120 M = 1, STEPERCOL
+ ST = STT
+ DO 130 SWEEPID = ST, ED
+ DO 140 K = 1, GRSIZ
+ MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ)
+ $ + (M-1)*GRSIZ + K
+ IF ( MYID.EQ.1 ) THEN
+ TTYPE = 1
+ ELSE
+ TTYPE = MOD( MYID, 2 ) + 2
+ ENDIF
+
+ IF( TTYPE.EQ.2 ) THEN
+ COLPT = (MYID/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ BLKLASTIND = COLPT
+ ELSE
+ COLPT = ((MYID+1)/2)*KD + SWEEPID
+ STIND = COLPT-KD+1
+ EDIND = MIN(COLPT,N)
+ IF( ( STIND.GE.EDIND-1 ).AND.
+ $ ( EDIND.EQ.N ) ) THEN
+ BLKLASTIND = N
+ ELSE
+ BLKLASTIND = 0
+ ENDIF
+ ENDIF
+*
+* Call the kernel
+*
+#if defined(_OPENMP)
+ IF( TTYPE.NE.1 ) THEN
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(in:WORK(MYID-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$ DEPEND(out:WORK(MYID))
+ TID = OMP_GET_THREAD_NUM()
+ CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+!$OMP END TASK
+ ENDIF
+#else
+ CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
+ $ STIND, EDIND, SWEEPID, N, KD, IB,
+ $ WORK ( INDA ), LDA,
+ $ HOUS( INDV ), HOUS( INDTAU ), LDV,
+ $ WORK( INDW + TID*KD ) )
+#endif
+ IF ( BLKLASTIND.GE.(N-1) ) THEN
+ STT = STT + 1
+ GOTO 130
+ ENDIF
+ 140 CONTINUE
+ 130 CONTINUE
+ 120 CONTINUE
+ 110 CONTINUE
+ 100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*
+* Copy the diagonal from A to D. Note that D is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ DO 150 I = 1, N
+ D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) )
+ 150 CONTINUE
+*
+* Copy the off diagonal from A to E. Note that E is REAL thus only
+* the Real part is needed, the imaginary part should be zero.
+*
+ IF( UPPER ) THEN
+ DO 160 I = 1, N-1
+ E( I ) = DBLE( WORK( OFDPOS+I*LDA ) )
+ 160 CONTINUE
+ ELSE
+ DO 170 I = 1, N-1
+ E( I ) = DBLE( WORK( OFDPOS+(I-1)*LDA ) )
+ 170 CONTINUE
+ ENDIF
+*
+ 200 CONTINUE
+*
+ HOUS( 1 ) = LHMIN
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of ZHETRD_HB2ST
+*
+ END
+#undef PRECISION_COMPLEX
+
diff --git a/SRC/zhetrd_he2hb.f b/SRC/zhetrd_he2hb.f
new file mode 100644
index 00000000..9403b73e
--- /dev/null
+++ b/SRC/zhetrd_he2hb.f
@@ -0,0 +1,517 @@
+*> \brief \b ZHETRD_HE2HB
+*
+* @precisions fortran z -> s d c
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHETRD_HE2HB + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+* WORK, LWORK, INFO )
+*
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 A( LDA, * ), AB( LDAB, * ),
+* TAU( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian
+*> band-diagonal form AB by a unitary similarity transformation:
+*> Q**H * A * Q = AB.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> = 'U': Upper triangle of A is stored;
+*> = 'L': Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The order of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*> KD is INTEGER
+*> The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*> or the number of subdiagonals if UPLO = 'L'. KD >= 0.
+*> The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA,N)
+*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+*> N-by-N upper triangular part of A contains the upper
+*> triangular part of the matrix A, and the strictly lower
+*> triangular part of A is not referenced. If UPLO = 'L', the
+*> leading N-by-N lower triangular part of A contains the lower
+*> triangular part of the matrix A, and the strictly upper
+*> triangular part of A is not referenced.
+*> On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*> of A are overwritten by the corresponding elements of the
+*> tridiagonal matrix T, and the elements above the first
+*> superdiagonal, with the array TAU, represent the unitary
+*> matrix Q as a product of elementary reflectors; if UPLO
+*> = 'L', the diagonal and first subdiagonal of A are over-
+*> written by the corresponding elements of the tridiagonal
+*> matrix T, and the elements below the first subdiagonal, with
+*> the array TAU, represent the unitary matrix Q as a product
+*> of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*> AB is COMPLEX*16 array, dimension (LDAB,N)
+*> On exit, the upper or lower triangle of the Hermitian band
+*> matrix A, stored in the first KD+1 rows of the array. The
+*> j-th column of A is stored in the j-th column of the array AB
+*> as follows:
+*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*> LDAB is INTEGER
+*> The leading dimension of the array AB. LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*> TAU is COMPLEX*16 array, dimension (N-KD)
+*> The scalar factors of the elementary reflectors (see Further
+*> Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension LWORK.
+*> On exit, if INFO = 0, or if LWORK=-1,
+*> WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> The dimension of the array WORK which should be calculated
+* by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates the optimal size of the WORK array, returns
+*> this value as the first entry of the WORK array, and no error
+*> message related to LWORK is issued by XERBLA.
+*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*> where FACTOPTNB is the blocking used by the QR or LQ
+*> algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*> putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEcomputational
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> Implemented by Azzam Haidar.
+*>
+*> All details are available on technical report, SC11, SC13 papers.
+*>
+*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*> Parallel reduction to condensed forms for symmetric eigenvalue problems
+*> using aggregated fine-grained and memory-aware kernels. In Proceedings
+*> of 2011 International Conference for High Performance Computing,
+*> Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*> Article 8 , 11 pages.
+*> http://doi.acm.org/10.1145/2063384.2063394
+*>
+*> A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*> An improved parallel singular value algorithm and its implementation
+*> for multicore hardware, In Proceedings of 2013 International Conference
+*> for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*> Denver, Colorado, USA, 2013.
+*> Article 90, 12 pages.
+*> http://doi.acm.org/10.1145/2503210.2503292
+*>
+*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure
+*> calculations based on fine-grained memory aware tasks.
+*> International Journal of High Performance Computing Applications.
+*> Volume 28 Issue 2, Pages 196-209, May 2014.
+*> http://hpc.sagepub.com/content/28/2/196
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*> If UPLO = 'U', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**H
+*>
+*> where tau is a complex scalar, and v is a complex vector with
+*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*> A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*> If UPLO = 'L', the matrix Q is represented as a product of elementary
+*> reflectors
+*>
+*> Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*> Each H(i) has the form
+*>
+*> H(i) = I - tau * v * v**H
+*>
+*> where tau is a complex scalar, and v is a complex vector with
+*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+* A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*> The contents of A on exit are illustrated by the following examples
+*> with n = 5:
+*>
+*> if UPLO = 'U': if UPLO = 'L':
+*>
+*> ( ab ab/v1 v1 v1 v1 ) ( ab )
+*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab )
+*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab )
+*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab )
+*> ( ab ) ( v1 v2 v3 ab/v4 ab )
+*>
+*> where d and e denote diagonal and off-diagonal elements of T, and vi
+*> denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU,
+ $ WORK, LWORK, INFO )
+*
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2016
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INFO, LDA, LDAB, LWORK, N, KD
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 A( LDA, * ), AB( LDAB, * ),
+ $ TAU( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION RONE
+ COMPLEX*16 ZERO, ONE, HALF
+ PARAMETER ( RONE = 1.0D+0,
+ $ ZERO = ( 0.0D+0, 0.0D+0 ),
+ $ ONE = ( 1.0D+0, 0.0D+0 ),
+ $ HALF = ( 0.5D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY, UPPER
+ INTEGER I, J, IINFO, LWMIN, PN, PK, LK,
+ $ LDT, LDW, LDS2, LDS1,
+ $ LS2, LS1, LW, LT,
+ $ TPOS, WPOS, S2POS, S1POS
+* ..
+* .. External Subroutines ..
+ EXTERNAL XERBLA, ZHER2K, ZHEMM, ZGEMM,
+ $ ZLARFT, ZGELQF, ZGEQRF, ZLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN, MAX
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ INTEGER ILAENV
+ EXTERNAL LSAME, ILAENV
+* ..
+* .. Executable Statements ..
+*
+* Determine the minimal workspace size required
+* and test the input parameters
+*
+ INFO = 0
+ UPPER = LSAME( UPLO, 'U' )
+ LQUERY = ( LWORK.EQ.-1 )
+ LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+
+ IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+ INFO = -2
+ ELSE IF( KD.LT.0 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+ INFO = -5
+ ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+ INFO = -7
+ ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+ INFO = -10
+ END IF
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
+ RETURN
+ ELSE IF( LQUERY ) THEN
+ WORK( 1 ) = LWMIN
+ RETURN
+ END IF
+*
+* Quick return if possible
+* Copy the upper/lower portion of A into AB
+*
+ IF( N.LE.KD+1 ) THEN
+ IF( UPPER ) THEN
+ DO 100 I = 1, N
+ LK = MIN( KD+1, I )
+ CALL ZCOPY( LK, A( I-LK+1, I ), 1,
+ $ AB( KD+1-LK+1, I ), 1 )
+ 100 CONTINUE
+ ELSE
+ DO 110 I = 1, N
+ LK = MIN( KD+1, N-I+1 )
+ CALL ZCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+ 110 CONTINUE
+ ENDIF
+ WORK( 1 ) = 1
+ RETURN
+ END IF
+*
+* Determine the pointer position for the workspace
+*
+ LDT = KD
+ LDS1 = KD
+ LT = LDT*KD
+ LW = N*KD
+ LS1 = LDS1*KD
+ LS2 = LWMIN - LT - LW - LS1
+* LS2 = N*MAX(KD,FACTOPTNB)
+ TPOS = 1
+ WPOS = TPOS + LT
+ S1POS = WPOS + LW
+ S2POS = S1POS + LS1
+ IF( UPPER ) THEN
+ LDW = KD
+ LDS2 = KD
+ ELSE
+ LDW = N
+ LDS2 = N
+ ENDIF
+*
+*
+* Set the workspace of the triangular matrix T to zero once such a
+* way everytime T is generated the upper/lower portion will be always zero
+*
+ CALL ZLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+ IF( UPPER ) THEN
+ DO 10 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the LQ factorization of the current block
+*
+ CALL ZGELQF( KD, PN, A( I, I+KD ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 20 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 20 CONTINUE
+*
+ CALL ZLASET( 'Lower', PK, PK, ZERO, ONE,
+ $ A( I, I+KD ), LDA )
+*
+* Form the matrix T
+*
+ CALL ZLARFT( 'Forward', 'Rowwise', PN, PK,
+ $ A( I, I+KD ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL ZGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+ $ ONE, WORK( TPOS ), LDT,
+ $ A( I, I+KD ), LDA,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL ZHEMM( 'Right', UPLO, PK, PN,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL ZGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+ $ ONE, WORK( WPOS ), LDW,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL ZGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+ $ -HALF, WORK( S1POS ), LDS1,
+ $ A( I, I+KD ), LDA,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V'*W - W'*V
+*
+ CALL ZHER2K( UPLO, 'Conjugate', PN, PK,
+ $ -ONE, A( I, I+KD ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+ 10 CONTINUE
+*
+* Copy the upper band to AB which is the band storage matrix
+*
+ DO 30 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+ 30 CONTINUE
+*
+ ELSE
+*
+* Reduce the lower triangle of A to lower band matrix
+*
+ DO 40 I = 1, N - KD, KD
+ PN = N-I-KD+1
+ PK = MIN( N-I-KD+1, KD )
+*
+* Compute the QR factorization of the current block
+*
+ CALL ZGEQRF( PN, KD, A( I+KD, I ), LDA,
+ $ TAU( I ), WORK( S2POS ), LS2, IINFO )
+*
+* Copy the upper portion of A into AB
+*
+ DO 50 J = I, I+PK-1
+ LK = MIN( KD, N-J ) + 1
+ CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 50 CONTINUE
+*
+ CALL ZLASET( 'Upper', PK, PK, ZERO, ONE,
+ $ A( I+KD, I ), LDA )
+*
+* Form the matrix T
+*
+ CALL ZLARFT( 'Forward', 'Columnwise', PN, PK,
+ $ A( I+KD, I ), LDA, TAU( I ),
+ $ WORK( TPOS ), LDT )
+*
+* Compute W:
+*
+ CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ ONE, A( I+KD, I ), LDA,
+ $ WORK( TPOS ), LDT,
+ $ ZERO, WORK( S2POS ), LDS2 )
+*
+ CALL ZHEMM( 'Left', UPLO, PN, PK,
+ $ ONE, A( I+KD, I+KD ), LDA,
+ $ WORK( S2POS ), LDS2,
+ $ ZERO, WORK( WPOS ), LDW )
+*
+ CALL ZGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+ $ ONE, WORK( S2POS ), LDS2,
+ $ WORK( WPOS ), LDW,
+ $ ZERO, WORK( S1POS ), LDS1 )
+*
+ CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+ $ -HALF, A( I+KD, I ), LDA,
+ $ WORK( S1POS ), LDS1,
+ $ ONE, WORK( WPOS ), LDW )
+*
+*
+* Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+* an update of the form: A := A - V*W' - W*V'
+*
+ CALL ZHER2K( UPLO, 'No transpose', PN, PK,
+ $ -ONE, A( I+KD, I ), LDA,
+ $ WORK( WPOS ), LDW,
+ $ RONE, A( I+KD, I+KD ), LDA )
+* ==================================================================
+* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+* DO 45 J = I, I+PK-1
+* LK = MIN( KD, N-J ) + 1
+* CALL ZCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+* 45 CONTINUE
+* ==================================================================
+ 40 CONTINUE
+*
+* Copy the lower band to AB which is the band storage matrix
+*
+ DO 60 J = N-KD+1, N
+ LK = MIN(KD, N-J) + 1
+ CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+ 60 CONTINUE
+
+ END IF
+*
+ WORK( 1 ) = LWMIN
+ RETURN
+*
+* End of ZHETRD_HE2HB
+*
+ END
diff --git a/SRC/zlarfy.f b/SRC/zlarfy.f
new file mode 100644
index 00000000..39b795f0
--- /dev/null
+++ b/SRC/zlarfy.f
@@ -0,0 +1,163 @@
+*> \brief \b ZLARFY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* .. Scalar Arguments ..
+* CHARACTER UPLO
+* INTEGER INCV, LDC, N
+* COMPLEX*16 TAU
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n Hermitian matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*> H = I - tau * v * v'
+*>
+*> where tau is a scalar and v is a vector.
+*>
+*> If tau is zero, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> Specifies whether the upper or lower triangular part of the
+*> Hermitian matrix C is stored.
+*> = 'U': Upper triangle
+*> = 'L': Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of rows and columns of the matrix C. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX*16 array, dimension
+*> (1 + (N-1)*abs(INCV))
+*> The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*> INCV is INTEGER
+*> The increment between successive elements of v. INCV must
+*> not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*> TAU is COMPLEX*16
+*> The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX*16 array, dimension (LDC, N)
+*> On entry, the matrix C.
+*> On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (N)
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup complex16_eig
+*
+* =====================================================================
+ SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+* -- LAPACK test routine (version 3.4.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2011
+*
+* .. Scalar Arguments ..
+ CHARACTER UPLO
+ INTEGER INCV, LDC, N
+ COMPLEX*16 TAU
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 C( LDC, * ), V( * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX*16 ONE, ZERO, HALF
+ PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ),
+ $ ZERO = ( 0.0D+0, 0.0D+0 ),
+ $ HALF = ( 0.5D+0, 0.0D+0 ) )
+* ..
+* .. Local Scalars ..
+ COMPLEX*16 ALPHA
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZAXPY, ZHEMV, ZHER2
+* ..
+* .. External Functions ..
+ COMPLEX*16 ZDOTC
+ EXTERNAL ZDOTC
+* ..
+* .. Executable Statements ..
+*
+ IF( TAU.EQ.ZERO )
+ $ RETURN
+*
+* Form w:= C * v
+*
+ CALL ZHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+ ALPHA = -HALF*TAU*ZDOTC( N, WORK, 1, V, INCV )
+ CALL ZAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+* C := C - v * w' - w * v'
+*
+ CALL ZHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+ RETURN
+*
+* End of ZLARFY
+*
+ END