55 files changed, 24112 insertions, 10 deletions
diff --git a/SRC/Makefile b/SRC/Makefile
index 01cf7021..c521d7f6 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -56,7 +56,7 @@ include ../make.inc
 #
 #######################################################################
 
-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o	\
+ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o iparam2stage.o	\
     ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \
     ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o
 
@@ -120,7 +120,7 @@ SLASRC = \
    slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o  slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \
+   slarf.o  slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
    slarrv.o slartv.o  \
    slarz.o  slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
    slasyf_rk.o \
@@ -167,7 +167,10 @@ SLASRC = \
    sgelqt.o sgelqt3.o sgemlqt.o \
    sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
    sgelq.o slaswlq.o slamswlq.o sgemlq.o \
-   stplqt.o stplqt2.o stpmlqt.o
+   stplqt.o stplqt2.o stpmlqt.o \
+   ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \
+   ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \
+   ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o
 
 DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o
 
@@ -224,7 +227,7 @@ CLASRC = \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
    clarf.o  clarfb.o clarfg.o clarft.o clarfgp.o \
-   clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
+   clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
    clarz.o  clarzb.o clarzt.o clascl.o claset.o clasr.o  classq.o \
    claswp.o clasyf.o clasyf_rook.o clasyf_rk.o \
    clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \
@@ -263,7 +266,10 @@ CLASRC = \
    cgelqt.o cgelqt3.o cgemlqt.o \
    cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
    cgelq.o claswlq.o clamswlq.o cgemlq.o \
-   ctplqt.o ctplqt2.o ctpmlqt.o
+   ctplqt.o ctplqt2.o ctpmlqt.o \
+   chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \
+   cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \
+   chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o
 
 ifdef USEXBLAS
 CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \
@@ -306,7 +312,7 @@ DLASRC = \
    dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
    dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
    dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
-   dlarf.o  dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \
+   dlarf.o  dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
    dlargv.o dlarrv.o dlartv.o  \
    dlarz.o  dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
    dlasyf.o dlasyf_rook.o dlasyf_rk.o \
@@ -355,7 +361,10 @@ DLASRC = \
    dgelqt.o dgelqt3.o dgemlqt.o \
    dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
    dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \
-   dtplqt.o dtplqt2.o dtpmlqt.o 
+   dtplqt.o dtplqt2.o dtpmlqt.o \
+   dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \
+   dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \
+   dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o
 
 ifdef USEXBLAS
 DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o		\
@@ -413,7 +422,7 @@ ZLASRC = \
    zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
    zlarcm.o zlarf.o  zlarfb.o \
    zlarfg.o zlarft.o zlarfgp.o \
-   zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
+   zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
    zlarz.o  zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o  \
    zlassq.o zlaswp.o zlasyf.o zlasyf_rook.o zlasyf_rk.o \
    zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlauu2.o \
@@ -455,7 +464,10 @@ ZLASRC = \
    zgelqt.o zgelqt3.o zgemlqt.o \
    zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
    zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \
-   ztplqt.o ztplqt2.o ztpmlqt.o  
+   ztplqt.o ztplqt2.o ztpmlqt.o \
+   zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
+   zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \
+   zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o
 
 ifdef USEXBLAS
 ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o		\
diff --git a/SRC/chb2st_kernels.f b/SRC/chb2st_kernels.f
new file mode 100644
index 00000000..8b0a4b28
--- /dev/null
+++ b/SRC/chb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b CHB2ST_KERNELS
+*
+*  @generated from zhb2st_kernels.f, fortran z -> c, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CHB2ST_KERNELS + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*                                   ST, ED, SWEEP, N, NB, IB,
+*                                   A, LDA, V, TAU, LDVT, WORK)
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       LOGICAL            WANTZ
+*       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * ), V( * ), 
+*                          TAU( * ), WORK( * )
+*  
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHB2ST_KERNELS is an internal routine used by the CHETRD_HB2ST
+*> subroutine.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> @param[in] n
+*>          The order of the matrix A.
+*>
+*> @param[in] nb
+*>          The size of the band.
+*>
+*> @param[in, out] A
+*>          A pointer to the matrix A.
+*>
+*> @param[in] lda
+*>          The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*>          COMPLEX array, dimension 2*n if eigenvalues only are
+*>          requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*>          COMPLEX array, dimension (2*n).
+*>          The scalar factors of the Householder reflectors are stored
+*>          in this array.
+*>
+*> @param[in] st
+*>          internal parameter for indices.
+*>
+*> @param[in] ed
+*>          internal parameter for indices.
+*>
+*> @param[in] sweep
+*>          internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*>          internal parameter for indices.
+*>
+*> @param[in] wantz
+*>          logical which indicate if Eigenvalue are requested or both
+*>          Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*>          Workspace of size nb.
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+     $                            ST, ED, SWEEP, N, NB, IB,
+     $                            A, LDA, V, TAU, LDVT, WORK)
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      LOGICAL            WANTZ
+      INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), V( * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   ONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
+     $                   DPOS, OFDPOS, AJETER 
+      COMPLEX            CTMP 
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLARFG, CLARFX, CLARFY
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MOD
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     ..
+*     .. Executable Statements ..
+*      
+      AJETER = IB + LDVT
+      UPPER = LSAME( UPLO, 'U' )
+
+      IF( UPPER ) THEN
+          DPOS    = 2 * NB + 1
+          OFDPOS  = 2 * NB
+      ELSE
+          DPOS    = 1
+          OFDPOS  = 2
+      ENDIF
+
+*
+*     Upper case
+*  
+      IF( UPPER ) THEN
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 101, 102, 103 ) TTYPE
+*
+  101     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 10 I = 1, LM-1
+              V( VPOS+I )         = CONJG( A( OFDPOS-I, ST+I ) )
+              A( OFDPOS-I, ST+I ) = ZERO  
+   10     CONTINUE
+          CTMP = CONJG( A( OFDPOS, ST ) )
+          CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+          A( OFDPOS, ST ) = CTMP
+* 
+  103     CONTINUE
+          LM = ED - ST + 1
+          CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ),
+     $                             A( DPOS, ST ), LDA-1, WORK)
+          GOTO 300
+*
+  102     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+          IF( LM.GT.0) THEN
+              CALL CLARFX( 'Left', LN, LM, V( VPOS ),
+     $                     CONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*
+              V( VPOS ) = ONE
+              DO 30 I = 1, LM-1
+                  V( VPOS+I )          = CONJG( A( DPOS-NB-I, J1+I ) )
+                  A( DPOS-NB-I, J1+I ) = ZERO
+   30         CONTINUE
+              CTMP = CONJG( A( DPOS-NB, J1 ) )
+              CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+              A( DPOS-NB, J1 ) = CTMP
+*             
+              CALL CLARFX( 'Right', LN-1, LM, V( VPOS ), 
+     $                     TAU( TAUPOS ),
+     $                     A( DPOS-NB+1, J1 ), LDA-1, WORK)
+          ENDIF
+          GOTO 300
+*
+*     Lower case
+*  
+      ELSE
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 201, 202, 203 ) TTYPE
+*  
+  201     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 20 I = 1, LM-1
+              V( VPOS+I )         = A( OFDPOS+I, ST-1 )
+              A( OFDPOS+I, ST-1 ) = ZERO  
+   20     CONTINUE
+          CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+*
+  203     CONTINUE
+          LM = ED - ST + 1
+*
+          CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ),
+     $                                      A( DPOS, ST ), LDA-1, WORK)
+
+          GOTO 300
+*
+  202     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+*
+          IF( LM.GT.0) THEN
+              CALL CLARFX( 'Right', LM, LN, V( VPOS ), 
+     $                     TAU( TAUPOS ), A( DPOS+NB, ST ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*              
+              V( VPOS ) = ONE
+              DO 40 I = 1, LM-1
+                  V( VPOS+I )        = A( DPOS+NB+I, ST )
+                  A( DPOS+NB+I, ST ) = ZERO
+   40         CONTINUE
+              CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+     $                                    TAU( TAUPOS ) )
+*                  
+              CALL CLARFX( 'Left', LM, LN-1, V( VPOS ), 
+     $                     CONJG( TAU( TAUPOS ) ),
+     $                     A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+          ENDIF
+          GOTO 300
+      ENDIF
+
+  300 CONTINUE    
+      RETURN
+*
+*     END OF CHB2ST_KERNELS
+*
+      END      
diff --git a/SRC/chbev_2stage.f b/SRC/chbev_2stage.f
new file mode 100644
index 00000000..182d3d93
--- /dev/null
+++ b/SRC/chbev_2stage.f
@@ -0,0 +1,386 @@
+*> \brief <b> CHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from zhbev_2stage.f, fortran z -> c, Sat Nov  5 23:18:20 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                WORK, LWORK, RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*       ..
+*       .. Array Arguments ..
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (max(1,3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                         WORK, LWORK, RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*     ..
+*     .. Array Arguments ..
+      REAL               RWORK( * ), W( * )
+      COMPLEX            AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, WANTZ, LQUERY
+      INTEGER            IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHB
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR
+     $                   CHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -11
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHBEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            W( 1 ) = REAL( AB( 1, 1 ) )
+         ELSE
+            W( 1 ) = REAL( AB( KD+1, 1 ) )
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = 1
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, call CSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         INDRWK = INDE + N
+         CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ,
+     $                RWORK( INDRWK ), INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of CHBEV_2STAGE
+*
+      END
diff --git a/SRC/chbevd_2stage.f b/SRC/chbevd_2stage.f
new file mode 100644
index 00000000..89c118d3
--- /dev/null
+++ b/SRC/chbevd_2stage.f
@@ -0,0 +1,458 @@
+*> \brief <b> CHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from zhbevd_2stage.f, fortran z -> c, Sat Nov  5 23:18:17 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                 WORK, LWORK, RWORK, LRWORK, IWORK, 
+*                                 LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  If eigenvectors are desired, it
+*> uses a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array,
+*>                                         dimension (LRWORK)
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of array RWORK.
+*>          If N <= 1,               LRWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*>          If JOBZ = 'V' and N > 1, LRWORK must be at least
+*>                        1 + 5*N + 2*N**2.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of array IWORK.
+*>          If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*>          If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N .
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                          WORK, LWORK, RWORK, LRWORK, IWORK, 
+     $                          LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               RWORK( * ), W( * )
+      COMPLEX            AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E0, 0.0E0 ),
+     $                   CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE,
+     $                   LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS,
+     $                   LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHB
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSCAL, SSTERF, XERBLA, CGEMM, CLACPY,
+     $                   CLASCL, CSTEDC, CHETRD_HB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( N.LE.1 ) THEN
+         LWMIN = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         IB    = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+         IF( WANTZ ) THEN
+            LWMIN = 2*N**2
+            LRWMIN = 1 + 5*N + 2*N**2
+            LIWMIN = 3 + 5*N
+         ELSE
+            LWMIN  = MAX( N, LHTRD + LWTRD )
+            LRWMIN = N
+            LIWMIN = 1
+         END IF
+      END IF
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -13
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHBEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = REAL( AB( 1, 1 ) )
+         IF( WANTZ )
+     $      Z( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDRWK  = INDE + N
+      LLRWK   = LRWORK - INDRWK + 1
+      INDHOUS = 1
+      INDWK   = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWK + 1
+      INDWK2  = INDWK + N*N
+      LLWK2   = LWORK - INDWK2 + 1
+*
+      CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, call CSTEDC.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ),
+     $                LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK,
+     $                INFO )
+         CALL CGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO,
+     $               WORK( INDWK2 ), N )
+         CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+      RETURN
+*
+*     End of CHBEVD_2STAGE
+*
+      END
diff --git a/SRC/chbevx_2stage.f b/SRC/chbevx_2stage.f
new file mode 100644
index 00000000..07eb6153
--- /dev/null
+++ b/SRC/chbevx_2stage.f
@@ -0,0 +1,646 @@
+*> \brief <b> CHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from zhbevx_2stage.f, fortran z -> c, Sat Nov  5 23:18:22 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+*                                 Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+*                                 IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+*      $                   Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors
+*> can be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found;
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found;
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is COMPLEX array, dimension (LDQ, N)
+*>          If JOBZ = 'V', the N-by-N unitary matrix used in the
+*>                          reduction to tridiagonal form.
+*>          If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*>          LDQ is INTEGER
+*>          The leading dimension of the array Q.  If JOBZ = 'V', then
+*>          LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing AB to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*SLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+     $                          Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+     $                          IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      REAL               RWORK( * ), W( * )
+      COMPLEX            AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+     $                   Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E0, 0.0E0 ),
+     $                   CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+     $                   LQUERY
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   J, JJ, NSPLIT
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+      COMPLEX            CTMP1
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHB
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY,
+     $                   CGEMV, CLACPY, CLASCL, CSTEIN, CSTEQR,
+     $                   CSWAP, CHETRD_HB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -7
+      ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+         INFO = -9
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -11
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -12
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -13
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+     $      INFO = -18
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -20
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHBEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         M = 1
+         IF( LOWER ) THEN
+            CTMP1 = AB( 1, 1 )
+         ELSE
+            CTMP1 = AB( KD+1, 1 )
+         END IF
+         TMP1 = REAL( CTMP1 )
+         IF( VALEIG ) THEN
+            IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+     $         M = 0
+         END IF
+         IF( M.EQ.1 ) THEN
+            W( 1 ) = REAL( CTMP1 )
+            IF( WANTZ )
+     $         Z( 1, 1 ) = CONE
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      ELSE
+         VLL = ZERO
+         VUU = ZERO
+      END IF
+      ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDD = 1
+      INDE = INDD + N
+      INDRWK = INDE + N
+*
+      INDHOUS = 1
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL CHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB,
+     $                    RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal
+*     to zero, then call SSTERF or CSTEQR.  If this fails for some
+*     eigenvalue, then try SSTEBZ.
+*
+      TEST = .FALSE.
+      IF (INDEIG) THEN
+         IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+         CALL SCOPY( N, RWORK( INDD ), 1, W, 1 )
+         INDEE = INDRWK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL SSTERF( N, W, RWORK( INDEE ), INFO )
+         ELSE
+            CALL CLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+            CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+     $                   RWORK( INDRWK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 10 I = 1, N
+                  IFAIL( I ) = 0
+   10          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWK = INDISP + N
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWK ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by CSTEIN.
+*
+         DO 20 J = 1, M
+            CALL CCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+            CALL CGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO,
+     $                  Z( 1, J ), 1 )
+   20    CONTINUE
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of CHBEVX_2STAGE
+*
+      END
diff --git a/SRC/cheev_2stage.f b/SRC/cheev_2stage.f
new file mode 100644
index 00000000..b98dac76
--- /dev/null
+++ b/SRC/cheev_2stage.f
@@ -0,0 +1,355 @@
+*> \brief <b> CHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @generated from zheev_2stage.f, fortran z -> c, Sat Nov  5 23:18:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                                RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            A( LDA, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                         RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      REAL               RWORK( * ), W( * )
+      COMPLEX            A( LDA, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHE
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
+     $                   CUNGTR, CHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -8
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHEEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = REAL( A( 1, 1 ) )
+         WORK( 1 ) = 1
+         IF( WANTZ )
+     $      A( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, first call
+*     CUNGTR to generate the unitary matrix, then call CSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL CUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+     $                LLWORK, IINFO )
+         INDWRK = INDE + N
+         CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA,
+     $                RWORK( INDWRK ), INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal complex workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of CHEEV_2STAGE
+*
+      END
diff --git a/SRC/cheevd_2stage.f b/SRC/cheevd_2stage.f
new file mode 100644
index 00000000..9d1057fc
--- /dev/null
+++ b/SRC/cheevd_2stage.f
@@ -0,0 +1,451 @@
+*> \brief <b> CHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @generated from zheevd_2stage.f, fortran z -> c, Sat Nov  5 23:18:14 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                          RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LIWORK, LRWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            A( LDA, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N+1
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N+1
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array,
+*>                                         dimension (LRWORK)
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of the array RWORK.
+*>          If N <= 1,                LRWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LRWORK must be at least N.
+*>          If JOBZ  = 'V' and N > 1, LRWORK must be at least
+*>                         1 + 5*N + 2*N**2.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If N <= 1,                LIWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
+*>                to converge; i off-diagonal elements of an intermediate
+*>                tridiagonal form did not converge to zero;
+*>                if INFO = i and JOBZ = 'V', then the algorithm failed
+*>                to compute an eigenvalue while working on the submatrix
+*>                lying in rows and columns INFO/(N+1) through
+*>                mod(INFO,N+1).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  Modified description of INFO. Sven, 16 Feb 05.
+*
+*> \par Contributors:
+*  ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                   RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LIWORK, LRWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               RWORK( * ), W( * )
+      COMPLEX            A( LDA, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2,
+     $                   INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK,
+     $                   LLWRK2, LRWMIN, LWMIN,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+
+
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHE
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSCAL, SSTERF, XERBLA, CLACPY, CLASCL,
+     $                   CSTEDC, CUNMTR, CHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            LRWMIN = 1
+            LIWMIN = 1
+         ELSE
+            KD    = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            IF( WANTZ ) THEN
+               LWMIN = 2*N + N*N
+               LRWMIN = 1 + 5*N + 2*N**2
+               LIWMIN = 3 + 5*N
+            ELSE
+               LWMIN = N + 1 + LHTRD + LWTRD
+               LRWMIN = N
+               LIWMIN = 1
+            END IF
+         END IF
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -8
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -10
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -12
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHEEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = REAL( A( 1, 1 ) )
+         IF( WANTZ )
+     $      A( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDRWK  = INDE + N
+      LLRWK   = LRWORK - INDRWK + 1
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, first call
+*     CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+*     tridiagonal matrix, then call CUNMTR to multiply it to the
+*     Householder transformations represented as Householder vectors in
+*     A.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK,
+     $                IWORK, LIWORK, INFO )
+         CALL CUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+     $                WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+         CALL CLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of CHEEVD_2STAGE
+*
+      END
diff --git a/SRC/cheevr_2stage.f b/SRC/cheevr_2stage.f
new file mode 100644
index 00000000..23a98389
--- /dev/null
+++ b/SRC/cheevr_2stage.f
@@ -0,0 +1,779 @@
+*> \brief <b> CHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @generated from zheevr_2stage.f, fortran z -> c, Sat Nov  5 23:18:11 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+*                                 WORK, LWORK, RWORK, LRWORK, IWORK,
+*                                 LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+*      $                   M, N
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            ISUPPZ( * ), IWORK( * )
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            A( LDA, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> CHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to CHETRD.  Then, whenever possible, CHEEVR_2STAGE calls CSTEMR to compute
+*> eigenspectrum using Relatively Robust Representations.  CSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*>    (a) Compute T - sigma I  = L D L^T, so that L and D
+*>        define all the wanted eigenvalues to high relative accuracy.
+*>        This means that small relative changes in the entries of D and L
+*>        cause only small relative changes in the eigenvalues and
+*>        eigenvectors. The standard (unfactored) representation of the
+*>        tridiagonal matrix T does not have this property in general.
+*>    (b) Compute the eigenvalues to suitable accuracy.
+*>        If the eigenvectors are desired, the algorithm attains full
+*>        accuracy of the computed eigenvalues only right before
+*>        the corresponding vectors have to be computed, see steps c) and d).
+*>    (c) For each cluster of close eigenvalues, select a new
+*>        shift close to the cluster, find a new factorization, and refine
+*>        the shifted eigenvalues to suitable accuracy.
+*>    (d) For each eigenvalue with a large enough relative separation compute
+*>        the corresponding eigenvector by forming a rank revealing twisted
+*>        factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*>   to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*>   Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*>   Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*>   2004.  Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*>   tridiagonal eigenvalue/eigenvector problem",
+*>   Computer Science Division Technical Report No. UCB/CSD-97-971,
+*>   UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : CHEEVR_2STAGE calls CSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> CHEEVR_2STAGE calls SSTEBZ and CSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of CSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*>          For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and
+*>          CSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*>
+*>          If high relative accuracy is important, set ABSTOL to
+*>          SLAMCH( 'Safe minimum' ).  Doing so will guarantee that
+*>          eigenvalues are computed to high relative accuracy when
+*>          possible in future releases.  The current code does not
+*>          make any guarantees about high relative accuracy, but
+*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          "Computing Accurate Eigensystems of Scaled Diagonally
+*>          Dominant Matrices", LAPACK Working Note #7, for a discussion
+*>          of which matrices define their eigenvalues to high relative
+*>          accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*>          ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*>          The support of the eigenvectors in Z, i.e., the indices
+*>          indicating the nonzero elements in Z. The i-th eigenvector
+*>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*>          ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal
+*>          matrix). The support of the eigenvectors of A is typically 
+*>          1:N because of the unitary transformations applied by CUNMTR.
+*>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 26*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,LRWORK))
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal
+*>          (and minimal) LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal
+*>          (and minimal) LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  Internal error
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Inderjit Dhillon, IBM Almaden, USA \n
+*>     Osni Marques, LBNL/NERSC, USA \n
+*>     Ken Stanley, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>     Jason Riedy, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+     $                          WORK, LWORK, RWORK, LRWORK, IWORK,
+     $                          LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+     $                   M, N
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            ISUPPZ( * ), IWORK( * )
+      REAL               RWORK( * ), W( * )
+      COMPLEX            A( LDA, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ, TRYRAC
+      CHARACTER          ORDER
+      INTEGER            I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP,
+     $                   INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK,
+     $                   INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ,
+     $                   LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN,
+     $                   LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANSY
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
+     $                   CHETRD_2STAGE, CSTEMR, CSTEIN, CSWAP, CUNMTR
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IEEEOK = ILAENV( 10, 'CHEEVR', 'N', 1, 2, 3, 4 )
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+*
+      LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
+     $         ( LIWORK.EQ.-1 ) )
+*
+      KD     = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+      LHTRD  = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWTRD  = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWMIN  = N + LHTRD + LWTRD
+      LRWMIN = MAX( 1, 24*N )
+      LIWMIN = MAX( 1, 10*N )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -18
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -22
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHEEVR_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         WORK( 1 ) = 2
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = REAL( A( 1, 1 ) )
+         ELSE
+            IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) )
+     $           THEN
+               M = 1
+               W( 1 ) = REAL( A( 1, 1 ) )
+            END IF
+         END IF
+         IF( WANTZ ) THEN
+            Z( 1, 1 ) = ONE
+            ISUPPZ( 1 ) = 1
+            ISUPPZ( 2 ) = 1
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF (VALEIG) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = CLANSY( 'M', UPLO, N, A, LDA, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL CSSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+
+*     Initialize indices into workspaces.  Note: The IWORK indices are
+*     used only if SSTERF or CSTEMR fail.
+
+*     WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the
+*     elementary reflectors used in CHETRD.
+      INDTAU = 1
+*     INDWK is the starting offset of the remaining complex workspace,
+*     and LLWORK is the remaining complex workspace size.
+      INDHOUS = INDTAU + N
+      INDWK   = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWK + 1
+
+*     RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal
+*     entries.
+      INDRD = 1
+*     RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the
+*     tridiagonal matrix from CHETRD.
+      INDRE = INDRD + N
+*     RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over
+*     -written by CSTEMR (the SSTERF path copies the diagonal to W).
+      INDRDD = INDRE + N
+*     RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over
+*     -written while computing the eigenvalues in SSTERF and CSTEMR.
+      INDREE = INDRDD + N
+*     INDRWK is the starting offset of the left-over real workspace, and
+*     LLRWORK is the remaining workspace size.
+      INDRWK = INDREE + N
+      LLRWORK = LRWORK - INDRWK + 1
+
+*     IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and
+*     stores the block indices of each of the M<=N eigenvalues.
+      INDIBL = 1
+*     IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and
+*     stores the starting and finishing indices of each block.
+      INDISP = INDIBL + N
+*     IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+*     that corresponding to eigenvectors that fail to converge in
+*     CSTEIN.  This information is discarded; if any fail, the driver
+*     returns INFO > 0.
+      INDIFL = INDISP + N
+*     INDIWO is the offset of the remaining integer workspace.
+      INDIWO = INDIFL + N
+
+*
+*     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+     $                    RWORK( INDRE ), WORK( INDTAU ),
+     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired
+*     then call SSTERF or CSTEMR and CUNMTR.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N, RWORK( INDRD ), 1, W, 1 )
+            CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+            CALL SSTERF( N, W, RWORK( INDREE ), INFO )
+         ELSE
+            CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+            CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
+*
+            IF (ABSTOL .LE. TWO*N*EPS) THEN
+               TRYRAC = .TRUE.
+            ELSE
+               TRYRAC = .FALSE.
+            END IF
+            CALL CSTEMR( JOBZ, 'A', N, RWORK( INDRDD ),
+     $                   RWORK( INDREE ), VL, VU, IL, IU, M, W,
+     $                   Z, LDZ, N, ISUPPZ, TRYRAC,
+     $                   RWORK( INDRWK ), LLRWORK,
+     $                   IWORK, LIWORK, INFO )
+*
+*           Apply unitary matrix used in reduction to tridiagonal
+*           form to eigenvectors returned by CSTEMR.
+*
+            IF( WANTZ .AND. INFO.EQ.0 ) THEN
+               INDWKN = INDWK
+               LLWRKN = LWORK - INDWKN + 1
+               CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA,
+     $                      WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+     $                      LLWRKN, IINFO )
+            END IF
+         END IF
+*
+*
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+*     Also call SSTEBZ and CSTEIN if CSTEMR fails.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL CSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+     $                INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by CSTEIN.
+*
+         INDWKN = INDWK
+         LLWRKN = LWORK - INDWKN + 1
+         CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of CHEEVR_2STAGE
+*
+      END
diff --git a/SRC/cheevx_2stage.f b/SRC/cheevx_2stage.f
new file mode 100644
index 00000000..84ae438d
--- /dev/null
+++ b/SRC/cheevx_2stage.f
@@ -0,0 +1,618 @@
+*> \brief <b> CHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @generated from zheevx_2stage.f, fortran z -> c, Sat Nov  5 23:18:09 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+*                                 LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            A( LDA, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*SLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          On normal exit, the first M elements contain the selected
+*>          eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 8*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+     $                          LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      REAL               RWORK( * ), W( * )
+      COMPLEX            A( LDA, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
+     $                   ITMP1, J, JJ, LLWORK, 
+     $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, CLANHE
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
+     $                   CLACPY, CSTEIN, CSTEQR, CSWAP, CUNGTR, CUNMTR,
+     $                   CHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            KD    = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWMIN = N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         END IF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -17
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHEEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = REAL( A( 1, 1 ) )
+         ELSE IF( VALEIG ) THEN
+            IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) )
+     $           THEN
+               M = 1
+               W( 1 ) = REAL( A( 1, 1 ) )
+            END IF
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL CSSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDD    = 1
+      INDE    = INDD + N
+      INDRWK  = INDE + N
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
+     $                    RWORK( INDE ), WORK( INDTAU ), 
+     $                    WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
+     $                    LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal to
+*     zero, then call SSTERF or CUNGTR and CSTEQR.  If this fails for
+*     some eigenvalue, then try SSTEBZ.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+         CALL SCOPY( N, RWORK( INDD ), 1, W, 1 )
+         INDEE = INDRWK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL SSTERF( N, W, RWORK( INDEE ), INFO )
+         ELSE
+            CALL CLACPY( 'A', N, N, A, LDA, Z, LDZ )
+            CALL CUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+     $                   WORK( INDWRK ), LLWORK, IINFO )
+            CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+     $                   RWORK( INDRWK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 30 I = 1, N
+                  IFAIL( I ) = 0
+   30          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 40
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWK = INDISP + N
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWK ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by CSTEIN.
+*
+         CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWRK ), LLWORK, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   40 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 60 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 50 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   50       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   60    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal complex workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of CHEEVX_2STAGE
+*
+      END
diff --git a/SRC/chegv_2stage.f b/SRC/chegv_2stage.f
new file mode 100644
index 00000000..71d58d74
--- /dev/null
+++ b/SRC/chegv_2stage.f
@@ -0,0 +1,379 @@
+*> \brief \b CHEGV_2STAGE
+*
+*  @generated from zhegv_2stage.f, fortran z -> c, Sun Nov  6 13:09:52 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHEGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chegv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+*                                WORK, LWORK, RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               RWORK( * ), W( * )
+*       COMPLEX            A( LDA, * ), B( LDB, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a complex generalized Hermitian-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be Hermitian and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+*  sizes N>2000.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*>          ITYPE is INTEGER
+*>          Specifies the problem type to be solved:
+*>          = 1:  A*x = (lambda)*B*x
+*>          = 2:  A*B*x = (lambda)*x
+*>          = 3:  B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangles of A and B are stored;
+*>          = 'L':  Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          matrix Z of eigenvectors.  The eigenvectors are normalized
+*>          as follows:
+*>          if ITYPE = 1 or 2, Z**H*B*Z = I;
+*>          if ITYPE = 3, Z**H*inv(B)*Z = I.
+*>          If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*>          or the lower triangle (if UPLO='L') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB, N)
+*>          On entry, the Hermitian positive definite matrix B.
+*>          If UPLO = 'U', the leading N-by-N upper triangular part of B
+*>          contains the upper triangular part of the matrix B.
+*>          If UPLO = 'L', the leading N-by-N lower triangular part of B
+*>          contains the lower triangular part of the matrix B.
+*>
+*>          On exit, if INFO <= N, the part of B containing the matrix is
+*>          overwritten by the triangular factor U or L from the Cholesky
+*>          factorization B = U**H*U or B = L*L**H.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  CPOTRF or CHEEV returned an error code:
+*>             <= N:  if INFO = i, CHEEV failed to converge;
+*>                    i off-diagonal elements of an intermediate
+*>                    tridiagonal form did not converge to zero;
+*>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
+*>                    minor of order i of B is not positive definite.
+*>                    The factorization of B could not be completed and
+*>                    no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexHEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+     $                         WORK, LWORK, RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      REAL               RWORK( * ), W( * )
+      COMPLEX            A( LDA, * ), B( LDB, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER          ( ONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTZ
+      CHARACTER          TRANS
+      INTEGER            NEIG, LWMIN, LHTRD, LWTRD, KD, IB
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM,
+     $                   CHEEV_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      UPPER = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHEGV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     Form a Cholesky factorization of B.
+*
+      CALL CPOTRF( UPLO, N, B, LDB, INFO )
+      IF( INFO.NE.0 ) THEN
+         INFO = N + INFO
+         RETURN
+      END IF
+*
+*     Transform problem to standard eigenvalue problem and solve.
+*
+      CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+      CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, 
+     $                   WORK, LWORK, RWORK, INFO )
+*
+      IF( WANTZ ) THEN
+*
+*        Backtransform eigenvectors to the original problem.
+*
+         NEIG = N
+         IF( INFO.GT.0 )
+     $      NEIG = INFO - 1
+         IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+*           For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+*           backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'N'
+            ELSE
+               TRANS = 'C'
+            END IF
+*
+            CALL CTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+*
+         ELSE IF( ITYPE.EQ.3 ) THEN
+*
+*           For B*A*x=(lambda)*x;
+*           backtransform eigenvectors: x = L*y or U**H *y
+*
+            IF( UPPER ) THEN
+               TRANS = 'C'
+            ELSE
+               TRANS = 'N'
+            END IF
+*
+            CALL CTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+         END IF
+      END IF
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of CHEGV_2STAGE
+*
+      END
diff --git a/SRC/chetrd_2stage.f b/SRC/chetrd_2stage.f
new file mode 100644
index 00000000..795462c6
--- /dev/null
+++ b/SRC/chetrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b CHETRD_2STAGE
+*
+*  @generated from zhetrd_2stage.f, fortran z -> c, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CHETRD_2STAGE + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       CHARACTER          VECT, UPLO
+*       INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*      ..
+*      .. Array Arguments ..
+*       REAL               D( * ), E( * )
+*       COMPLEX            A( LDA, * ), TAU( * ),
+*                          HOUS2( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q1**H Q2**H* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  in particular for the second stage (Band to
+*>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>                  then LHOUS2 is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the band superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          internal band-diagonal matrix AB, and the elements above 
+*>          the KD superdiagonal, with the array TAU, represent the unitary
+*>          matrix Q1 as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and band subdiagonal of A are over-
+*>          written by the corresponding elements of the internal band-diagonal
+*>          matrix AB, and the elements below the KD subdiagonal, with
+*>          the array TAU, represent the unitary matrix Q1 as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is REAL array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors of 
+*>          the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*>          HOUS2 is COMPLEX array, dimension LHOUS2, that
+*>          store the Householder representation of the stage2
+*>          band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*>          LHOUS2 is INTEGER
+*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS2 array, returns
+*>          this value as the first entry of the HOUS2 array, and no error
+*>          message related to LHOUS2 is issued by XERBLA.
+*>          LHOUS2 = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = max(stage1,stage2) + (KD+1)*N
+*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                        + max(2*KD*KD, KD*NTHREADS) 
+*>                        + (KD+1)*N 
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup complexHEcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          VECT, UPLO
+      INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*     ..
+*     .. Array Arguments ..
+      REAL               D( * ), E( * )
+      COMPLEX            A( LDA, * ), TAU( * ),
+     $                   HOUS2( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTQ
+      INTEGER            KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, CHETRD_HE2HB, CHETRD_HB2ST
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO   = 0
+      WANTQ  = LSAME( VECT, 'V' )
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      KD     = ILAENV( 17, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+*      WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+*     $            LHMIN, LWMIN
+*
+      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -12
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS2( 1 ) = LHMIN
+         WORK( 1 )  = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHETRD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDAB  = KD+1
+      LWRK  = LWORK-LDAB*N
+      ABPOS = 1
+      WPOS  = ABPOS + LDAB*N
+      CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+     $                   TAU, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHETRD_HE2HB', -INFO )
+         RETURN
+      END IF
+      CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
+     $                   WORK( ABPOS ), LDAB, D, E, 
+     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHETRD_HB2ST', -INFO )
+         RETURN
+      END IF
+*
+*
+      HOUS2( 1 ) = LHMIN
+      WORK( 1 )  = LWMIN
+      RETURN
+*
+*     End of CHETRD_2STAGE
+*
+      END
diff --git a/SRC/chetrd_hb2st.F b/SRC/chetrd_hb2st.F
new file mode 100644
index 00000000..6f253278
--- /dev/null
+++ b/SRC/chetrd_hb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b CHBTRD
+*
+*  @generated from zhetrd_hb2st.F, fortran z -> c, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CHBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+*       #define PRECISION_COMPLEX
+*
+*       #if defined(_OPENMP)
+*       use omp_lib
+*       #endif
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          STAGE1, UPLO, VECT
+*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
+*       ..
+*       .. Array Arguments ..
+*       REAL               D( * ), E( * )
+*       COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHBTRD reduces a complex Hermitian band matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q**H * A * Q = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*>          STAGE is CHARACTER*1
+*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>                  from dense to band using the chetrd_he2hb routine
+*>                  was not called before this routine to reproduce AB. 
+*>                  In other term this routine is called as standalone. 
+*>          = 'Y':  "Yes": to mention that the stage 1 of the 
+*>                  reduction from dense to band using the chetrd_he2hb 
+*>                  routine has been called to produce AB (e.g., AB is
+*>                  the output of chetrd_he2hb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  and thus LHOUS is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate or to apply Q later on, 
+*>                  then LHOUS is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB,N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>          On exit, the diagonal elements of AB are overwritten by the
+*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*>          elements on the first superdiagonal (if UPLO = 'U') or the
+*>          first subdiagonal (if UPLO = 'L') are overwritten by the
+*>          off-diagonal elements of T; the rest of AB is overwritten by
+*>          values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is REAL array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T:
+*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*>          HOUS is COMPLEX array, dimension LHOUS, that
+*>          store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*>          LHOUS is INTEGER
+*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS array, returns
+*>          this value as the first entry of the HOUS array, and no error
+*>          message related to LHOUS is issued by XERBLA.
+*>          LHOUS = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'     
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = (2KD+1)*N + KD*NTHREADS
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complexOTHERcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_COMPLEX
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          STAGE1, UPLO, VECT
+      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
+*     ..
+*     .. Array Arguments ..
+      REAL               D( * ), E( * )
+      COMPLEX            AB( LDAB, * ), HOUS( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               RZERO
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( RZERO = 0.0E+0,
+     $                   ZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   ONE  = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+     $                   SICEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+      REAL               ABSTMP
+      COMPLEX            TMP
+#endif
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CHB2ST_KERNELS, CLACPY, CLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX, CEILING, REAL
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required.
+*     Test the input parameters
+*
+      DEBUG   = 0
+      INFO    = 0
+      AFTERS1 = LSAME( STAGE1, 'Y' )
+      WANTQ   = LSAME( VECT, 'V' )
+      UPPER   = LSAME( UPLO, 'U' )
+      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      IB     = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+*
+      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.(KD+1) ) THEN
+         INFO = -7
+      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -11
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -13
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS( 1 ) = LHMIN
+         WORK( 1 ) = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHETRD_HB2ST', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDV      = KD + IB
+      SIZETAU  = 2 * N
+      SICEV    = 2 * N
+      INDTAU   = 1
+      INDV     = INDTAU + SIZETAU
+      LDA      = 2 * KD + 1
+      SIZEA    = LDA * N
+      INDA     = 1
+      INDW     = INDA + SIZEA
+      NTHREADS = 1
+      TID      = 0
+*
+      IF( UPPER ) THEN
+          APOS     = INDA + KD
+          AWPOS    = INDA
+          DPOS     = APOS + KD
+          OFDPOS   = DPOS - 1
+          ABDPOS   = KD + 1
+          ABOFDPOS = KD
+      ELSE
+          APOS     = INDA 
+          AWPOS    = INDA + KD + 1
+          DPOS     = APOS
+          OFDPOS   = DPOS + 1
+          ABDPOS   = 1
+          ABOFDPOS = 2
+
+      ENDIF
+*      
+*     Case KD=0: 
+*     The matrix is diagonal. We just copy it (convert to "real" for 
+*     complex because D is double and the imaginary part should be 0) 
+*     and store it in D. A sequential code here is better or 
+*     in a parallel environment it might need two cores for D and E
+*
+      IF( KD.EQ.0 ) THEN
+          DO 30 I = 1, N
+              D( I ) = REAL( AB( ABDPOS, I ) )
+   30     CONTINUE
+          DO 40 I = 1, N-1
+              E( I ) = RZERO
+   40     CONTINUE
+         GOTO 200
+      END IF
+*      
+*     Case KD=1: 
+*     The matrix is already Tridiagonal. We have to make diagonal 
+*     and offdiagonal elements real, and store them in D and E.
+*     For that, for real precision just copy the diag and offdiag 
+*     to D and E while for the COMPLEX case the bulge chasing is  
+*     performed to convert the hermetian tridiagonal to symmetric 
+*     tridiagonal. A simpler coversion formula might be used, but then 
+*     updating the Q matrix will be required and based if Q is generated
+*     or not this might complicate the story. 
+*      
+C      IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+      IF( KD.EQ.1 ) THEN
+          DO 50 I = 1, N
+              D( I ) = REAL( AB( ABDPOS, I ) )
+   50     CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+*         make off-diagonal elements real and copy them to E
+*
+          IF( UPPER ) THEN
+              DO 60 I = 1, N - 1
+                  TMP = AB( ABOFDPOS, I+1 )
+                  ABSTMP = ABS( TMP )
+                  AB( ABOFDPOS, I+1 ) = ABSTMP
+                  E( I ) = ABSTMP
+                  IF( ABSTMP.NE.RZERO ) THEN
+                     TMP = TMP / ABSTMP
+                  ELSE
+                     TMP = ONE
+                  END IF
+                  IF( I.LT.N-1 )
+     $               AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C                  IF( WANTZ ) THEN
+C                     CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 )
+C                  END IF
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N - 1
+                 TMP = AB( ABOFDPOS, I )
+                 ABSTMP = ABS( TMP )
+                 AB( ABOFDPOS, I ) = ABSTMP
+                 E( I ) = ABSTMP
+                 IF( ABSTMP.NE.RZERO ) THEN
+                    TMP = TMP / ABSTMP
+                 ELSE
+                    TMP = ONE
+                 END IF
+                 IF( I.LT.N-1 )
+     $              AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C                 IF( WANTQ ) THEN
+C                    CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C                 END IF
+   70         CONTINUE
+          ENDIF
+#else
+          IF( UPPER ) THEN
+              DO 60 I = 1, N-1
+                 E( I ) = REAL( AB( ABOFDPOS, I+1 ) )
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N-1
+                 E( I ) = REAL( AB( ABOFDPOS, I ) )
+   70         CONTINUE
+          ENDIF
+#endif
+          GOTO 200
+      END IF
+*
+*     Main code start here. 
+*     Reduce the hermitian band of A to a tridiagonal matrix.
+*
+      THGRSIZ   = N
+      GRSIZ     = 1
+      SHIFT     = 3
+      NBTILES   = CEILING( REAL(N)/REAL(KD) )
+      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*      
+      CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+      CALL CLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+*     openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+*     main bulge chasing loop
+*      
+      DO 100 THGRID = 1, THGRNB
+          STT  = (THGRID-1)*THGRSIZ+1
+          THED = MIN( (STT + THGRSIZ -1), (N-1))
+          DO 110 I = STT, N-1
+              ED = MIN( I, THED )
+              IF( STT.GT.ED ) GOTO 100
+              DO 120 M = 1, STEPERCOL
+                  ST = STT
+                  DO 130 SWEEPID = ST, ED
+                      DO 140 K = 1, GRSIZ
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+     $                           + (M-1)*GRSIZ + K
+                          IF ( MYID.EQ.1 ) THEN
+                              TTYPE = 1
+                          ELSE
+                              TTYPE = MOD( MYID, 2 ) + 2
+                          ENDIF
+
+                          IF( TTYPE.EQ.2 ) THEN
+                              COLPT      = (MYID/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              BLKLASTIND = COLPT
+                          ELSE
+                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              IF( ( STIND.GE.EDIND-1 ).AND.
+     $                            ( EDIND.EQ.N ) ) THEN
+                                  BLKLASTIND = N
+                              ELSE
+                                  BLKLASTIND = 0
+                              ENDIF
+                          ENDIF
+*
+*                         Call the kernel
+*                             
+#if defined(_OPENMP)
+                          IF( TTYPE.NE.1 ) THEN      
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(in:WORK(MYID-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ENDIF
+#else
+                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                         STIND, EDIND, SWEEPID, N, KD, IB,
+     $                         WORK ( INDA ), LDA, 
+     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                         WORK( INDW + TID*KD ) )
+#endif 
+                          IF ( BLKLASTIND.GE.(N-1) ) THEN
+                              STT = STT + 1
+                              GOTO 130
+                          ENDIF
+  140                 CONTINUE
+  130             CONTINUE
+  120         CONTINUE
+  110     CONTINUE
+  100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*      
+*     Copy the diagonal from A to D. Note that D is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      DO 150 I = 1, N
+          D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) )
+  150 CONTINUE
+*      
+*     Copy the off diagonal from A to E. Note that E is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      IF( UPPER ) THEN
+          DO 160 I = 1, N-1
+             E( I ) = REAL( WORK( OFDPOS+I*LDA ) )
+  160     CONTINUE
+      ELSE
+          DO 170 I = 1, N-1
+             E( I ) = REAL( WORK( OFDPOS+(I-1)*LDA ) )
+  170     CONTINUE
+      ENDIF
+*
+  200 CONTINUE  
+*
+      HOUS( 1 ) = LHMIN
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of CHETRD_HB2ST
+*
+      END
+#undef PRECISION_COMPLEX
+      
diff --git a/SRC/chetrd_he2hb.f b/SRC/chetrd_he2hb.f
new file mode 100644
index 00000000..28f5dc60
--- /dev/null
+++ b/SRC/chetrd_he2hb.f
@@ -0,0 +1,517 @@
+*> \brief \b CHETRD_HE2HB
+*
+*  @generated from zhetrd_he2hb.f, fortran z -> c, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CHETRD_HE2HB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+*                              WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * ), AB( LDAB, * ), 
+*                          TAU( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian
+*> band-diagonal form AB by a unitary similarity transformation:
+*> Q**H * A * Q = AB.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*>          The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          tridiagonal matrix T, and the elements above the first
+*>          superdiagonal, with the array TAU, represent the unitary
+*>          matrix Q as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and first subdiagonal of A are over-
+*>          written by the corresponding elements of the tridiagonal
+*>          matrix T, and the elements below the first subdiagonal, with
+*>          the array TAU, represent the unitary matrix Q as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB,N)
+*>          On exit, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors (see Further
+*>          Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension LWORK.
+*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK which should be calculated
+*           by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*>          where FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*>          putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup complexHEcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*>  If UPLO = 'U', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**H
+*>
+*>  where tau is a complex scalar, and v is a complex vector with
+*>  v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*>  A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*>  If UPLO = 'L', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**H
+*>
+*>  where tau is a complex scalar, and v is a complex vector with
+*>  v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+*   A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*>  The contents of A on exit are illustrated by the following examples
+*>  with n = 5:
+*>
+*>  if UPLO = 'U':                       if UPLO = 'L':
+*>
+*>    (  ab  ab/v1  v1      v1     v1    )              (  ab                            )
+*>    (      ab     ab/v2   v2     v2    )              (  ab/v1  ab                     )
+*>    (             ab      ab/v3  v3    )              (  v1     ab/v2  ab              )
+*>    (                     ab     ab/v4 )              (  v1     v2     ab/v3  ab       )
+*>    (                            ab    )              (  v1     v2     v3     ab/v4 ab )
+*>
+*>  where d and e denote diagonal and off-diagonal elements of T, and vi
+*>  denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), AB( LDAB, * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               RONE
+      COMPLEX            ZERO, ONE, HALF
+      PARAMETER          ( RONE = 1.0E+0,
+     $                   ZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   ONE = ( 1.0E+0, 0.0E+0 ),
+     $                   HALF = ( 0.5E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER
+      INTEGER            I, J, IINFO, LWMIN, PN, PK, LK,
+     $                   LDT, LDW, LDS2, LDS1, 
+     $                   LS2, LS1, LW, LT,
+     $                   TPOS, WPOS, S2POS, S1POS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, CHER2K, CHEMM, CGEMM,
+     $                   CLARFT, CGELQF, CGEQRF, CLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required 
+*     and test the input parameters
+*
+      INFO   = 0
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWMIN  = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
+      
+      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+         INFO = -7
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CHETRD_HE2HB', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
+      END IF
+*
+*     Quick return if possible        
+*     Copy the upper/lower portion of A into AB 
+*
+      IF( N.LE.KD+1 ) THEN
+          IF( UPPER ) THEN
+              DO 100 I = 1, N
+                  LK = MIN( KD+1, I )
+                  CALL CCOPY( LK, A( I-LK+1, I ), 1, 
+     $                            AB( KD+1-LK+1, I ), 1 )
+  100         CONTINUE
+          ELSE
+              DO 110 I = 1, N
+                  LK = MIN( KD+1, N-I+1 )
+                  CALL CCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+  110         CONTINUE
+          ENDIF
+          WORK( 1 ) = 1
+          RETURN
+      END IF
+*
+*     Determine the pointer position for the workspace
+*      
+      LDT    = KD
+      LDS1   = KD
+      LT     = LDT*KD
+      LW     = N*KD
+      LS1    = LDS1*KD
+      LS2    = LWMIN - LT - LW - LS1
+*      LS2 = N*MAX(KD,FACTOPTNB) 
+      TPOS   = 1
+      WPOS   = TPOS  + LT
+      S1POS  = WPOS  + LW
+      S2POS  = S1POS + LS1 
+      IF( UPPER ) THEN
+          LDW    = KD
+          LDS2   = KD
+      ELSE
+          LDW    = N
+          LDS2   = N
+      ENDIF
+*
+*
+*     Set the workspace of the triangular matrix T to zero once such a
+*     way everytime T is generated the upper/lower portion will be always zero  
+*   
+      CALL CLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+      IF( UPPER ) THEN
+          DO 10 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the LQ factorization of the current block
+*        
+             CALL CGELQF( KD, PN, A( I, I+KD ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB
+*        
+             DO 20 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   20        CONTINUE
+*                
+             CALL CLASET( 'Lower', PK, PK, ZERO, ONE, 
+     $                    A( I, I+KD ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL CLARFT( 'Forward', 'Rowwise', PN, PK,
+     $                    A( I, I+KD ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL CGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+     $                   ONE,  WORK( TPOS ), LDT,
+     $                         A( I, I+KD ), LDA,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL CHEMM( 'Right', UPLO, PK, PN,
+     $                   ONE,  A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL CGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+     $                   ONE,  WORK( WPOS ), LDW,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL CGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+     $                   -HALF, WORK( S1POS ), LDS1, 
+     $                          A( I, I+KD ), LDA,
+     $                   ONE,   WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V'*W - W'*V
+*        
+             CALL CHER2K( UPLO, 'Conjugate', PN, PK,
+     $                    -ONE, A( I, I+KD ), LDA,
+     $                          WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+   10     CONTINUE
+*
+*        Copy the upper band to AB which is the band storage matrix
+*
+         DO 30 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   30    CONTINUE
+*
+      ELSE
+*
+*         Reduce the lower triangle of A to lower band matrix
+*        
+          DO 40 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the QR factorization of the current block
+*        
+             CALL CGEQRF( PN, KD, A( I+KD, I ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB 
+*        
+             DO 50 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   50        CONTINUE
+*                
+             CALL CLASET( 'Upper', PK, PK, ZERO, ONE, 
+     $                    A( I+KD, I ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL CLARFT( 'Forward', 'Columnwise', PN, PK,
+     $                    A( I+KD, I ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   ONE, A( I+KD, I ), LDA,
+     $                         WORK( TPOS ), LDT,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL CHEMM( 'Left', UPLO, PN, PK,
+     $                   ONE, A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL CGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+     $                   ONE, WORK( S2POS ), LDS2,
+     $                         WORK( WPOS ), LDW,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   -HALF, A( I+KD, I ), LDA,
+     $                         WORK( S1POS ), LDS1,
+     $                   ONE, WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V*W' - W*V'
+*        
+             CALL CHER2K( UPLO, 'No transpose', PN, PK,
+     $                    -ONE, A( I+KD, I ), LDA,
+     $                           WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+*            ==================================================================
+*            RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+*             DO 45 J = I, I+PK-1
+*                LK = MIN( KD, N-J ) + 1
+*                CALL CCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+*   45        CONTINUE
+*            ==================================================================
+   40     CONTINUE
+*
+*        Copy the lower band to AB which is the band storage matrix
+*
+         DO 60 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   60    CONTINUE
+
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of CHETRD_HE2HB
+*
+      END
diff --git a/SRC/clarfy.f b/SRC/clarfy.f
new file mode 100644
index 00000000..572a4723
--- /dev/null
+++ b/SRC/clarfy.f
@@ -0,0 +1,163 @@
+*> \brief \b CLARFY
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INCV, LDC, N
+*       COMPLEX            TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            C( LDC, * ), V( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n Hermitian matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*>    H = I - tau * v * v'
+*>
+*> where  tau  is a scalar and  v  is a vector.
+*>
+*> If  tau  is  zero, then  H  is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          Specifies whether the upper or lower triangular part of the
+*>          Hermitian matrix C is stored.
+*>          = 'U':  Upper triangle
+*>          = 'L':  Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of rows and columns of the matrix C.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                  (1 + (N-1)*abs(INCV))
+*>          The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between successive elements of v.  INCV must
+*>          not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX
+*>          The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension (LDC, N)
+*>          On entry, the matrix C.
+*>          On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C.  LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (N)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup complex_eig
+*
+*  =====================================================================
+      SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK test routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INCV, LDC, N
+      COMPLEX            TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ONE, ZERO, HALF
+      PARAMETER          ( ONE = ( 1.0E+0, 0.0E+0 ),
+     $                   ZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   HALF = ( 0.5E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX            ALPHA
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CAXPY, CHEMV, CHER2
+*     ..
+*     .. External Functions ..
+      COMPLEX            CDOTC
+      EXTERNAL           CDOTC
+*     ..
+*     .. Executable Statements ..
+*
+      IF( TAU.EQ.ZERO )
+     $   RETURN
+*
+*     Form  w:= C * v
+*
+      CALL CHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+      ALPHA = -HALF*TAU*CDOTC( N, WORK, 1, V, INCV )
+      CALL CAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+*     C := C - v * w' - w * v'
+*
+      CALL CHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+      RETURN
+*
+*     End of CLARFY
+*
+      END
diff --git a/SRC/dlarfy.f b/SRC/dlarfy.f
new file mode 100644
index 00000000..089aa94e
--- /dev/null
+++ b/SRC/dlarfy.f
@@ -0,0 +1,161 @@
+*> \brief \b DLARFY
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INCV, LDC, N
+*       DOUBLE PRECISION   TAU
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   C( LDC, * ), V( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n symmetric matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*>    H = I - tau * v * v'
+*>
+*> where  tau  is a scalar and  v  is a vector.
+*>
+*> If  tau  is  zero, then  H  is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          Specifies whether the upper or lower triangular part of the
+*>          symmetric matrix C is stored.
+*>          = 'U':  Upper triangle
+*>          = 'L':  Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of rows and columns of the matrix C.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                  (1 + (N-1)*abs(INCV))
+*>          The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between successive elements of v.  INCV must
+*>          not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION
+*>          The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (LDC, N)
+*>          On entry, the matrix C.
+*>          On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C.  LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (N)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup double_eig
+*
+*  =====================================================================
+      SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK test routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INCV, LDC, N
+      DOUBLE PRECISION   TAU
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO, HALF
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0, HALF = 0.5D+0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DAXPY, DSYMV, DSYR2
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DDOT
+      EXTERNAL           DDOT
+*     ..
+*     .. Executable Statements ..
+*
+      IF( TAU.EQ.ZERO )
+     $   RETURN
+*
+*     Form  w:= C * v
+*
+      CALL DSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+      ALPHA = -HALF*TAU*DDOT( N, WORK, 1, V, INCV )
+      CALL DAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+*     C := C - v * w' - w * v'
+*
+      CALL DSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+      RETURN
+*
+*     End of DLARFY
+*
+      END
diff --git a/SRC/dsb2st_kernels.f b/SRC/dsb2st_kernels.f
new file mode 100644
index 00000000..15d1186e
--- /dev/null
+++ b/SRC/dsb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b DSB2ST_KERNELS
+*
+*  @generated from zhb2st_kernels.f, fortran z -> d, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DSB2ST_KERNELS + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*                                   ST, ED, SWEEP, N, NB, IB,
+*                                   A, LDA, V, TAU, LDVT, WORK)
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       LOGICAL            WANTZ
+*       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), V( * ), 
+*                          TAU( * ), WORK( * )
+*  
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSB2ST_KERNELS is an internal routine used by the DSYTRD_SB2ST
+*> subroutine.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> @param[in] n
+*>          The order of the matrix A.
+*>
+*> @param[in] nb
+*>          The size of the band.
+*>
+*> @param[in, out] A
+*>          A pointer to the matrix A.
+*>
+*> @param[in] lda
+*>          The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*>          DOUBLE PRECISION array, dimension 2*n if eigenvalues only are
+*>          requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*>          DOUBLE PRECISION array, dimension (2*n).
+*>          The scalar factors of the Householder reflectors are stored
+*>          in this array.
+*>
+*> @param[in] st
+*>          internal parameter for indices.
+*>
+*> @param[in] ed
+*>          internal parameter for indices.
+*>
+*> @param[in] sweep
+*>          internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*>          internal parameter for indices.
+*>
+*> @param[in] wantz
+*>          logical which indicate if Eigenvalue are requested or both
+*>          Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*>          Workspace of size nb.
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+     $                            ST, ED, SWEEP, N, NB, IB,
+     $                            A, LDA, V, TAU, LDVT, WORK)
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      LOGICAL            WANTZ
+      INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), V( * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0,
+     $                   ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
+     $                   DPOS, OFDPOS, AJETER 
+      DOUBLE PRECISION   CTMP 
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLARFG, DLARFX, DLARFY
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MOD
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     ..
+*     .. Executable Statements ..
+*      
+      AJETER = IB + LDVT
+      UPPER = LSAME( UPLO, 'U' )
+
+      IF( UPPER ) THEN
+          DPOS    = 2 * NB + 1
+          OFDPOS  = 2 * NB
+      ELSE
+          DPOS    = 1
+          OFDPOS  = 2
+      ENDIF
+
+*
+*     Upper case
+*  
+      IF( UPPER ) THEN
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 101, 102, 103 ) TTYPE
+*
+  101     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 10 I = 1, LM-1
+              V( VPOS+I )         = ( A( OFDPOS-I, ST+I ) )
+              A( OFDPOS-I, ST+I ) = ZERO  
+   10     CONTINUE
+          CTMP = ( A( OFDPOS, ST ) )
+          CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+          A( OFDPOS, ST ) = CTMP
+* 
+  103     CONTINUE
+          LM = ED - ST + 1
+          CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+     $                             A( DPOS, ST ), LDA-1, WORK)
+          GOTO 300
+*
+  102     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+          IF( LM.GT.0) THEN
+              CALL DLARFX( 'Left', LN, LM, V( VPOS ),
+     $                     ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*
+              V( VPOS ) = ONE
+              DO 30 I = 1, LM-1
+                  V( VPOS+I )          = ( A( DPOS-NB-I, J1+I ) )
+                  A( DPOS-NB-I, J1+I ) = ZERO
+   30         CONTINUE
+              CTMP = ( A( DPOS-NB, J1 ) )
+              CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+              A( DPOS-NB, J1 ) = CTMP
+*             
+              CALL DLARFX( 'Right', LN-1, LM, V( VPOS ), 
+     $                     TAU( TAUPOS ),
+     $                     A( DPOS-NB+1, J1 ), LDA-1, WORK)
+          ENDIF
+          GOTO 300
+*
+*     Lower case
+*  
+      ELSE
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 201, 202, 203 ) TTYPE
+*  
+  201     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 20 I = 1, LM-1
+              V( VPOS+I )         = A( OFDPOS+I, ST-1 )
+              A( OFDPOS+I, ST-1 ) = ZERO  
+   20     CONTINUE
+          CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+*
+  203     CONTINUE
+          LM = ED - ST + 1
+*
+          CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+     $                                      A( DPOS, ST ), LDA-1, WORK)
+
+          GOTO 300
+*
+  202     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+*
+          IF( LM.GT.0) THEN
+              CALL DLARFX( 'Right', LM, LN, V( VPOS ), 
+     $                     TAU( TAUPOS ), A( DPOS+NB, ST ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*              
+              V( VPOS ) = ONE
+              DO 40 I = 1, LM-1
+                  V( VPOS+I )        = A( DPOS+NB+I, ST )
+                  A( DPOS+NB+I, ST ) = ZERO
+   40         CONTINUE
+              CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+     $                                    TAU( TAUPOS ) )
+*                  
+              CALL DLARFX( 'Left', LM, LN-1, V( VPOS ), 
+     $                     ( TAU( TAUPOS ) ),
+     $                     A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+          ENDIF
+          GOTO 300
+      ENDIF
+
+  300 CONTINUE    
+      RETURN
+*
+*     END OF DSB2ST_KERNELS
+*
+      END      
diff --git a/SRC/dsbev_2stage.f b/SRC/dsbev_2stage.f
new file mode 100644
index 00000000..771d29e0
--- /dev/null
+++ b/SRC/dsbev_2stage.f
@@ -0,0 +1,377 @@
+*> \brief <b> DSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, WANTZ, LQUERY
+      INTEGER            IINFO, IMAX, INDE, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSB
+      EXTERNAL           LSAME, DLAMCH, DLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASCL, DSCAL, DSTEQR, DSTERF, XERBLA
+     $                   DSYTRD_SB2ST 
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -11
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSBEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            W( 1 ) = AB( 1, 1 )
+         ELSE
+            W( 1 ) = AB( KD+1, 1 )
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, call SSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+         CALL DSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ),
+     $                INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of DSBEV_2STAGE
+*
+      END
diff --git a/SRC/dsbevd_2stage.f b/SRC/dsbevd_2stage.f
new file mode 100644
index 00000000..39074681
--- /dev/null
+++ b/SRC/dsbevd_2stage.f
@@ -0,0 +1,412 @@
+*> \brief <b> DSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                 WORK, LWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses
+*> a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK and IWORK
+*>          arrays, returns these values as the first entries of the WORK
+*>          and IWORK arrays, and no error message related to LWORK or
+*>          LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If JOBZ  = 'N' or N <= 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 2, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK and IWORK arrays, and no error message related to
+*>          LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                          WORK, LWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   LLWRK2
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSB
+      EXTERNAL           LSAME, DLAMCH, DLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DLACPY, DLASCL, DSCAL, DSTEDC,
+     $                   DSTERF, XERBLA, DSYTRD_SB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( N.LE.1 ) THEN
+         LIWMIN = 1
+         LWMIN = 1
+      ELSE
+         IB    = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+         IF( WANTZ ) THEN
+            LIWMIN = 3 + 5*N
+            LWMIN = 1 + 5*N + 2*N**2
+         ELSE
+            LIWMIN = 1
+            LWMIN = MAX( 2*N, N+LHTRD+LWTRD )
+         END IF
+      END IF
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -13
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSBEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = AB( 1, 1 )
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call DSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, call SSTEDC.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+         CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+         CALL DGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N,
+     $               ZERO, WORK( INDWK2 ), N )
+         CALL DLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 )
+     $   CALL DSCAL( N, ONE / SIGMA, W, 1 )
+*
+      WORK( 1 ) = LWMIN
+      IWORK( 1 ) = LIWMIN
+      RETURN
+*
+*     End of DSBEVD_2STAGE
+*
+      END
diff --git a/SRC/dsbevx_2stage.f b/SRC/dsbevx_2stage.f
new file mode 100644
index 00000000..3cb3f661
--- /dev/null
+++ b/SRC/dsbevx_2stage.f
@@ -0,0 +1,633 @@
+*> \brief <b> DSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+*                                 LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+*                                 LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       DOUBLE PRECISION   AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+*      $                   Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found;
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found;
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is DOUBLE PRECISION array, dimension (LDQ, N)
+*>          If JOBZ = 'V', the N-by-N orthogonal matrix used in the
+*>                         reduction to tridiagonal form.
+*>          If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*>          LDQ is INTEGER
+*>          The leading dimension of the array Q.  If JOBZ = 'V', then
+*>          LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing AB to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*DLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 7*N, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + 2*N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+     $                          LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+     $                          LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      DOUBLE PRECISION   AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+     $                   Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+     $                   LQUERY
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   NSPLIT
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSB
+      EXTERNAL           LSAME, DLAMCH, DLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGEMV, DLACPY, DLASCL, DSCAL,
+     $                   DSTEBZ, DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA,
+     $                   DSYTRD_SB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -7
+      ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+         INFO = -9
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -11
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -12
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -13
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+     $      INFO = -18
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = 2*N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -20
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSBEVX_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         M = 1
+         IF( LOWER ) THEN
+            TMP1 = AB( 1, 1 )
+         ELSE
+            TMP1 = AB( KD+1, 1 )
+         END IF
+         IF( VALEIG ) THEN
+            IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+     $         M = 0
+         END IF
+         IF( M.EQ.1 ) THEN
+            W( 1 ) = TMP1
+            IF( WANTZ )
+     $         Z( 1, 1 ) = ONE
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      ELSE
+         VLL = ZERO
+         VUU = ZERO
+      END IF
+      ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+      INDD    = 1
+      INDE    = INDD + N
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal
+*     to zero, then call DSTERF or SSTEQR.  If this fails for some
+*     eigenvalue, then try DSTEBZ.
+*
+      TEST = .FALSE.
+      IF (INDEIG) THEN
+         IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+         CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+         INDEE = INDWRK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL DLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+     $                   WORK( INDWRK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 10 I = 1, N
+                  IFAIL( I ) = 0
+   10          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWO = INDISP + N
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by DSTEIN.
+*
+         DO 20 J = 1, M
+            CALL DCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+            CALL DGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO,
+     $                  Z( 1, J ), 1 )
+   20    CONTINUE
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of DSBEVX_2STAGE
+*
+      END
diff --git a/SRC/dsyev_2stage.f b/SRC/dsyev_2stage.f
new file mode 100644
index 00000000..a42e86d8
--- /dev/null
+++ b/SRC/dsyev_2stage.f
@@ -0,0 +1,348 @@
+*> \brief <b> DSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*                                INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+     $                         INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSY
+      EXTERNAL           LSAME, ILAENV, DLAMCH, DLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASCL, DORGTR, DSCAL, DSTEQR, DSTERF,
+     $                   XERBLA, DSYTRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = 2*N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -8
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = A( 1, 1 )
+         WORK( 1 ) = 2
+         IF( WANTZ )
+     $      A( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = INDE + N
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, first call
+*     DORGTR to generate the orthogonal matrix, then call DSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+*        Not available in this release, and agrument checking should not
+*        let it getting here
+         RETURN
+         CALL DORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+     $                LLWORK, IINFO )
+         CALL DSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ),
+     $                INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of DSYEV_2STAGE
+*
+      END
diff --git a/SRC/dsyevd_2stage.f b/SRC/dsyevd_2stage.f
new file mode 100644
index 00000000..161f0e9e
--- /dev/null
+++ b/SRC/dsyevd_2stage.f
@@ -0,0 +1,406 @@
+*> \brief <b> DSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                                IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LIWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array,
+*>                                         dimension (LWORK)
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N+1
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be at least
+*>                                                1 + 6*N + 2*N**2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK and IWORK
+*>          arrays, returns these values as the first entries of the WORK
+*>          and IWORK arrays, and no error message related to LWORK or
+*>          LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If N <= 1,                LIWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK and IWORK arrays, and no error message related to
+*>          LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
+*>                to converge; i off-diagonal elements of an intermediate
+*>                tridiagonal form did not converge to zero;
+*>                if INFO = i and JOBZ = 'V', then the algorithm failed
+*>                to compute an eigenvalue while working on the submatrix
+*>                lying in rows and columns INFO/(N+1) through
+*>                mod(INFO,N+1).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date September 2012
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA \n
+*>  Modified by Francoise Tisseur, University of Tennessee \n
+*>  Modified description of INFO. Sven, 16 Feb 05. \n
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                          IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.2) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     September 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LIWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+*
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE,
+     $                   LIWMIN, LLWORK, LLWRK2, LWMIN,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSY
+      EXTERNAL           LSAME, DLAMCH, DLANSY, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLACPY, DLASCL, DORMTR, DSCAL, DSTEDC, DSTERF,
+     $                   DSYTRD_2STAGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LIWMIN = 1
+            LWMIN = 1
+         ELSE
+            KD    = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            IF( WANTZ ) THEN
+               LIWMIN = 3 + 5*N
+               LWMIN = 1 + 6*N + 2*N**2
+            ELSE
+               LIWMIN = 1
+               LWMIN = 2*N + 1 + LHTRD + LWTRD
+            END IF
+         END IF
+         WORK( 1 )  = LWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -8
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -10
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = A( 1, 1 )
+         IF( WANTZ )
+     $      A( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = INDE + N
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, first call
+*     DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+*     tridiagonal matrix, then call DORMTR to multiply it by the
+*     Householder transformations stored in A.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+*        Not available in this release, and agrument checking should not
+*        let it getting here
+         RETURN
+         CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+         CALL DORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+     $                WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+         CALL DLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 )
+     $   CALL DSCAL( N, ONE / SIGMA, W, 1 )
+*
+      WORK( 1 )  = LWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of DSYEVD_2STAGE
+*
+      END
diff --git a/SRC/dsyevr_2stage.f b/SRC/dsyevr_2stage.f
new file mode 100644
index 00000000..c1b468dc
--- /dev/null
+++ b/SRC/dsyevr_2stage.f
@@ -0,0 +1,740 @@
+*> \brief <b> DSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                          IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+*                          LWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            ISUPPZ( * ), IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> DSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to DSYTRD.  Then, whenever possible, DSYEVR_2STAGE calls DSTEMR to compute
+*> the eigenspectrum using Relatively Robust Representations.  DSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*>    (a) Compute T - sigma I  = L D L^T, so that L and D
+*>        define all the wanted eigenvalues to high relative accuracy.
+*>        This means that small relative changes in the entries of D and L
+*>        cause only small relative changes in the eigenvalues and
+*>        eigenvectors. The standard (unfactored) representation of the
+*>        tridiagonal matrix T does not have this property in general.
+*>    (b) Compute the eigenvalues to suitable accuracy.
+*>        If the eigenvectors are desired, the algorithm attains full
+*>        accuracy of the computed eigenvalues only right before
+*>        the corresponding vectors have to be computed, see steps c) and d).
+*>    (c) For each cluster of close eigenvalues, select a new
+*>        shift close to the cluster, find a new factorization, and refine
+*>        the shifted eigenvalues to suitable accuracy.
+*>    (d) For each eigenvalue with a large enough relative separation compute
+*>        the corresponding eigenvector by forming a rank revealing twisted
+*>        factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*>   to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*>   Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*>   Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*>   2004.  Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*>   tridiagonal eigenvalue/eigenvector problem",
+*>   Computer Science Division Technical Report No. UCB/CSD-97-971,
+*>   UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : DSYEVR_2STAGE calls DSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> DSYEVR_2STAGE calls DSTEBZ and SSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of DSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*>          For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and
+*>          DSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*>
+*>          If high relative accuracy is important, set ABSTOL to
+*>          DLAMCH( 'Safe minimum' ).  Doing so will guarantee that
+*>          eigenvalues are computed to high relative accuracy when
+*>          possible in future releases.  The current code does not
+*>          make any guarantees about high relative accuracy, but
+*>          future releases will. See J. Barlow and J. Demmel,
+*>          "Computing Accurate Eigensystems of Scaled Diagonally
+*>          Dominant Matrices", LAPACK Working Note #7, for a discussion
+*>          of which matrices define their eigenvalues to high relative
+*>          accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*>          Supplying N columns is always safe.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*>          ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*>          The support of the eigenvectors in Z, i.e., the indices
+*>          indicating the nonzero elements in Z. The i-th eigenvector
+*>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*>          ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal
+*>          matrix). The support of the eigenvectors of A is typically 
+*>          1:N because of the orthogonal transformations applied by DORMTR.
+*>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 26*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 5*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the IWORK array,
+*>          returns this value as the first entry of the IWORK array, and
+*>          no error message related to LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  Internal error
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Inderjit Dhillon, IBM Almaden, USA \n
+*>     Osni Marques, LBNL/NERSC, USA \n
+*>     Ken Stanley, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>     Jason Riedy, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                   IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+     $                   LWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            ISUPPZ( * ), IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ,
+     $                   TRYRAC
+      CHARACTER          ORDER
+      INTEGER            I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE,
+     $                   INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU,
+     $                   INDWK, INDWKN, ISCALE, J, JJ, LIWMIN,
+     $                   LLWORK, LLWRKN, LWMIN, NSPLIT,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSY
+      EXTERNAL           LSAME, ILAENV, DLAMCH, DLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DORMTR, DSCAL, DSTEBZ, DSTEMR, DSTEIN,
+     $                   DSTERF, DSWAP, DSYTRD_2STAGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IEEEOK = ILAENV( 10, 'DSYEVR', 'N', 1, 2, 3, 4 )
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+*
+      LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
+*
+      KD     = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+      LHTRD  = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWTRD  = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+      LIWMIN = MAX( 1, 10*N )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -18
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+*         NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
+*         NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
+*         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
+         WORK( 1 ) = LWMIN
+         IWORK( 1 ) = LIWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYEVR_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         WORK( 1 ) = 7
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = A( 1, 1 )
+         ELSE
+            IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+               M = 1
+               W( 1 ) = A( 1, 1 )
+            END IF
+         END IF
+         IF( WANTZ ) THEN
+            Z( 1, 1 ) = ONE
+            ISUPPZ( 1 ) = 1
+            ISUPPZ( 2 ) = 1
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF (VALEIG) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL DSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+
+*     Initialize indices into workspaces.  Note: The IWORK indices are
+*     used only if DSTERF or DSTEMR fail.
+
+*     WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the
+*     elementary reflectors used in DSYTRD.
+      INDTAU = 1
+*     WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries.
+      INDD = INDTAU + N
+*     WORK(INDE:INDE+N-1) stores the off-diagonal entries of the
+*     tridiagonal matrix from DSYTRD.
+      INDE = INDD + N
+*     WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over
+*     -written by DSTEMR (the DSTERF path copies the diagonal to W).
+      INDDD = INDE + N
+*     WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over
+*     -written while computing the eigenvalues in DSTERF and DSTEMR.
+      INDEE = INDDD + N
+*     INDHOUS is the starting offset Householder storage of stage 2
+      INDHOUS = INDEE + N
+*     INDWK is the starting offset of the left-over workspace, and
+*     LLWORK is the remaining workspace size.
+      INDWK  = INDHOUS + LHTRD
+      LLWORK = LWORK - INDWK + 1
+
+
+*     IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and
+*     stores the block indices of each of the M<=N eigenvalues.
+      INDIBL = 1
+*     IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and
+*     stores the starting and finishing indices of each block.
+      INDISP = INDIBL + N
+*     IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+*     that corresponding to eigenvectors that fail to converge in
+*     DSTEIN.  This information is discarded; if any fail, the driver
+*     returns INFO > 0.
+      INDIFL = INDISP + N
+*     INDIWO is the offset of the remaining integer workspace.
+      INDIWO = INDIFL + N
+
+*
+*     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+*
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+     $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired
+*     then call DSTERF or DSTEMR and DORMTR.
+*
+      IF( ( ALLEIG .OR. ( INDEIG .AND. IL.EQ.1 .AND. IU.EQ.N ) ) .AND.
+     $    IEEEOK.EQ.1 ) THEN
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 )
+*
+            IF (ABSTOL .LE. TWO*N*EPS) THEN
+               TRYRAC = .TRUE.
+            ELSE
+               TRYRAC = .FALSE.
+            END IF
+            CALL DSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ),
+     $                   VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ,
+     $                   TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK,
+     $                   INFO )
+*
+*
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by DSTEMR.
+*
+            IF( WANTZ .AND. INFO.EQ.0 ) THEN
+               INDWKN = INDE
+               LLWRKN = LWORK - INDWKN + 1
+               CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA,
+     $                      WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+     $                      LLWRKN, IINFO )
+            END IF
+         END IF
+*
+*
+         IF( INFO.EQ.0 ) THEN
+*           Everything worked.  Skip DSTEBZ/DSTEIN.  IWORK(:) are
+*           undefined.
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, DSTEIN.
+*     Also call DSTEBZ and DSTEIN if DSTEMR fails.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+     $                INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by DSTEIN.
+*
+         INDWKN = INDE
+         LLWRKN = LWORK - INDWKN + 1
+         CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+*  Jump here if DSTEMR/DSTEIN succeeded.
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.  Note: We do not sort the IFAIL portion of IWORK.
+*     It may not be initialized (if DSTEMR/DSTEIN succeeded), and we do
+*     not return this detailed information to the user.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               W( I ) = W( J )
+               W( J ) = TMP1
+               CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of DSYEVR_2STAGE
+*
+      END
diff --git a/SRC/dsyevx_2stage.f b/SRC/dsyevx_2stage.f
new file mode 100644
index 00000000..2c52e9e3
--- /dev/null
+++ b/SRC/dsyevx_2stage.f
@@ -0,0 +1,608 @@
+*> \brief <b> DSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+*                                 LWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of indices
+*> for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*DLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          On normal exit, the first M elements contain the selected
+*>          eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 8*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 3*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 3*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+     $                          LWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE,
+     $                   ITMP1, J, JJ, LLWORK, LLWRKN,
+     $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, DLANSY
+      EXTERNAL           LSAME, ILAENV, DLAMCH, DLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DLACPY, DORGTR, DORMTR, DSCAL, DSTEBZ,
+     $                   DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA,
+     $                   DSYTRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            KD    = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD )
+            WORK( 1 )  = LWMIN
+         END IF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -17
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = A( 1, 1 )
+         ELSE
+            IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+               M = 1
+               W( 1 ) = A( 1, 1 )
+            END IF
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL DSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDTAU  = 1
+      INDE    = INDTAU + N
+      INDD    = INDE + N
+      INDHOUS = INDD + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+     $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal to
+*     zero, then call DSTERF or DORGTR and SSTEQR.  If this fails for
+*     some eigenvalue, then try DSTEBZ.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+         CALL DCOPY( N, WORK( INDD ), 1, W, 1 )
+         INDEE = INDWRK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL DLACPY( 'A', N, N, A, LDA, Z, LDZ )
+            CALL DORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+     $                   WORK( INDWRK ), LLWORK, IINFO )
+            CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+     $                   WORK( INDWRK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 30 I = 1, N
+                  IFAIL( I ) = 0
+   30          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 40
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWO = INDISP + N
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by DSTEIN.
+*
+         INDWKN = INDE
+         LLWRKN = LWORK - INDWKN + 1
+         CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   40 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 60 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 50 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   50       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   60    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of DSYEVX_2STAGE
+*
+      END
diff --git a/SRC/dsygv_2stage.f b/SRC/dsygv_2stage.f
new file mode 100644
index 00000000..2c79ec8a
--- /dev/null
+++ b/SRC/dsygv_2stage.f
@@ -0,0 +1,370 @@
+*> \brief \b DSYGV_2STAGE
+*
+*  @precisions fortran d -> s
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSYGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsygv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+*                                WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a real generalized symmetric-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be symmetric and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+*  sizes N>2000.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*>          ITYPE is INTEGER
+*>          Specifies the problem type to be solved:
+*>          = 1:  A*x = (lambda)*B*x
+*>          = 2:  A*B*x = (lambda)*x
+*>          = 3:  B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangles of A and B are stored;
+*>          = 'L':  Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          matrix Z of eigenvectors.  The eigenvectors are normalized
+*>          as follows:
+*>          if ITYPE = 1 or 2, Z**T*B*Z = I;
+*>          if ITYPE = 3, Z**T*inv(B)*Z = I.
+*>          If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*>          or the lower triangle (if UPLO='L') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB, N)
+*>          On entry, the symmetric positive definite matrix B.
+*>          If UPLO = 'U', the leading N-by-N upper triangular part of B
+*>          contains the upper triangular part of the matrix B.
+*>          If UPLO = 'L', the leading N-by-N lower triangular part of B
+*>          contains the lower triangular part of the matrix B.
+*>
+*>          On exit, if INFO <= N, the part of B containing the matrix is
+*>          overwritten by the triangular factor U or L from the Cholesky
+*>          factorization B = U**T*U or B = L*L**T.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  DPOTRF or DSYEV returned an error code:
+*>             <= N:  if INFO = i, DSYEV failed to converge;
+*>                    i off-diagonal elements of an intermediate
+*>                    tridiagonal form did not converge to zero;
+*>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
+*>                    minor of order i of B is not positive definite.
+*>                    The factorization of B could not be completed and
+*>                    no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTZ
+      CHARACTER          TRANS
+      INTEGER            NEIG, LWMIN, LHTRD, LWTRD, KD, IB 
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DPOTRF, DSYGST, DTRMM, DTRSM, XERBLA,
+     $                   DSYEV_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      UPPER = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = 2*N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYGV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     Form a Cholesky factorization of B.
+*
+      CALL DPOTRF( UPLO, N, B, LDB, INFO )
+      IF( INFO.NE.0 ) THEN
+         INFO = N + INFO
+         RETURN
+      END IF
+*
+*     Transform problem to standard eigenvalue problem and solve.
+*
+      CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+      CALL DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
+*
+      IF( WANTZ ) THEN
+*
+*        Backtransform eigenvectors to the original problem.
+*
+         NEIG = N
+         IF( INFO.GT.0 )
+     $      NEIG = INFO - 1
+         IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+*           For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+*           backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'N'
+            ELSE
+               TRANS = 'T'
+            END IF
+*
+            CALL DTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+*
+         ELSE IF( ITYPE.EQ.3 ) THEN
+*
+*           For B*A*x=(lambda)*x;
+*           backtransform eigenvectors: x = L*y or U**T*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'T'
+            ELSE
+               TRANS = 'N'
+            END IF
+*
+            CALL DTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+         END IF
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of DSYGV_2STAGE
+*
+      END
diff --git a/SRC/dsytrd_2stage.f b/SRC/dsytrd_2stage.f
new file mode 100644
index 00000000..449a279e
--- /dev/null
+++ b/SRC/dsytrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b DSYTRD_2STAGE
+*
+*  @generated from zhetrd_2stage.f, fortran z -> d, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DSYTRD_2STAGE + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       CHARACTER          VECT, UPLO
+*       INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*      ..
+*      .. Array Arguments ..
+*       DOUBLE PRECISION   D( * ), E( * )
+*       DOUBLE PRECISION   A( LDA, * ), TAU( * ),
+*                          HOUS2( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q1**T Q2**T* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  in particular for the second stage (Band to
+*>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>                  then LHOUS2 is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the band superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          internal band-diagonal matrix AB, and the elements above 
+*>          the KD superdiagonal, with the array TAU, represent the orthogonal
+*>          matrix Q1 as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and band subdiagonal of A are over-
+*>          written by the corresponding elements of the internal band-diagonal
+*>          matrix AB, and the elements below the KD subdiagonal, with
+*>          the array TAU, represent the orthogonal matrix Q1 as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is DOUBLE PRECISION array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors of 
+*>          the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*>          HOUS2 is DOUBLE PRECISION array, dimension LHOUS2, that
+*>          store the Householder representation of the stage2
+*>          band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*>          LHOUS2 is INTEGER
+*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS2 array, returns
+*>          this value as the first entry of the HOUS2 array, and no error
+*>          message related to LHOUS2 is issued by XERBLA.
+*>          LHOUS2 = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = max(stage1,stage2) + (KD+1)*N
+*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                        + max(2*KD*KD, KD*NTHREADS) 
+*>                        + (KD+1)*N 
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          VECT, UPLO
+      INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   D( * ), E( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ),
+     $                   HOUS2( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTQ
+      INTEGER            KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, DSYTRD_SY2SB, DSYTRD_SB2ST
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO   = 0
+      WANTQ  = LSAME( VECT, 'V' )
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      KD     = ILAENV( 17, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+*      WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+*     $            LHMIN, LWMIN
+*
+      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -12
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS2( 1 ) = LHMIN
+         WORK( 1 )  = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYTRD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDAB  = KD+1
+      LWRK  = LWORK-LDAB*N
+      ABPOS = 1
+      WPOS  = ABPOS + LDAB*N
+      CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+     $                   TAU, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
+         RETURN
+      END IF
+      CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
+     $                   WORK( ABPOS ), LDAB, D, E, 
+     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
+         RETURN
+      END IF
+*
+*
+      HOUS2( 1 ) = LHMIN
+      WORK( 1 )  = LWMIN
+      RETURN
+*
+*     End of DSYTRD_2STAGE
+*
+      END
diff --git a/SRC/dsytrd_sb2st.F b/SRC/dsytrd_sb2st.F
new file mode 100644
index 00000000..d50debe1
--- /dev/null
+++ b/SRC/dsytrd_sb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b DSBTRD
+*
+*  @generated from zhetrd_hb2st.F, fortran z -> d, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DSBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+*       #define PRECISION_REAL
+*
+*       #if defined(_OPENMP)
+*       use omp_lib
+*       #endif
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          STAGE1, UPLO, VECT
+*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   D( * ), E( * )
+*       DOUBLE PRECISION   AB( LDAB, * ), HOUS( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSBTRD reduces a real symmetric band matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q**T * A * Q = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*>          STAGE is CHARACTER*1
+*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>                  from dense to band using the dsytrd_sy2sb routine
+*>                  was not called before this routine to reproduce AB. 
+*>                  In other term this routine is called as standalone. 
+*>          = 'Y':  "Yes": to mention that the stage 1 of the 
+*>                  reduction from dense to band using the dsytrd_sy2sb 
+*>                  routine has been called to produce AB (e.g., AB is
+*>                  the output of dsytrd_sy2sb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  and thus LHOUS is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate or to apply Q later on, 
+*>                  then LHOUS is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>          On exit, the diagonal elements of AB are overwritten by the
+*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*>          elements on the first superdiagonal (if UPLO = 'U') or the
+*>          first subdiagonal (if UPLO = 'L') are overwritten by the
+*>          off-diagonal elements of T; the rest of AB is overwritten by
+*>          values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is DOUBLE PRECISION array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T:
+*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*>          HOUS is DOUBLE PRECISION array, dimension LHOUS, that
+*>          store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*>          LHOUS is INTEGER
+*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS array, returns
+*>          this value as the first entry of the HOUS array, and no error
+*>          message related to LHOUS is issued by XERBLA.
+*>          LHOUS = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'     
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = (2KD+1)*N + KD*NTHREADS
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup real16OTHERcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_REAL
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          STAGE1, UPLO, VECT
+      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   D( * ), E( * )
+      DOUBLE PRECISION   AB( LDAB, * ), HOUS( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   RZERO
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( RZERO = 0.0D+0,
+     $                   ZERO = 0.0D+0,
+     $                   ONE  = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+     $                   SIDEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+      DOUBLE PRECISION   ABSTMP
+      DOUBLE PRECISION   TMP
+#endif
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSB2ST_KERNELS, DLACPY, DLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX, CEILING, REAL
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required.
+*     Test the input parameters
+*
+      DEBUG   = 0
+      INFO    = 0
+      AFTERS1 = LSAME( STAGE1, 'Y' )
+      WANTQ   = LSAME( VECT, 'V' )
+      UPPER   = LSAME( UPLO, 'U' )
+      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      IB     = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+*
+      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.(KD+1) ) THEN
+         INFO = -7
+      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -11
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -13
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS( 1 ) = LHMIN
+         WORK( 1 ) = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDV      = KD + IB
+      SIZETAU  = 2 * N
+      SIDEV    = 2 * N
+      INDTAU   = 1
+      INDV     = INDTAU + SIZETAU
+      LDA      = 2 * KD + 1
+      SIZEA    = LDA * N
+      INDA     = 1
+      INDW     = INDA + SIZEA
+      NTHREADS = 1
+      TID      = 0
+*
+      IF( UPPER ) THEN
+          APOS     = INDA + KD
+          AWPOS    = INDA
+          DPOS     = APOS + KD
+          OFDPOS   = DPOS - 1
+          ABDPOS   = KD + 1
+          ABOFDPOS = KD
+      ELSE
+          APOS     = INDA 
+          AWPOS    = INDA + KD + 1
+          DPOS     = APOS
+          OFDPOS   = DPOS + 1
+          ABDPOS   = 1
+          ABOFDPOS = 2
+
+      ENDIF
+*      
+*     Case KD=0: 
+*     The matrix is diagonal. We just copy it (convert to "real" for 
+*     real because D is double and the imaginary part should be 0) 
+*     and store it in D. A sequential code here is better or 
+*     in a parallel environment it might need two cores for D and E
+*
+      IF( KD.EQ.0 ) THEN
+          DO 30 I = 1, N
+              D( I ) = ( AB( ABDPOS, I ) )
+   30     CONTINUE
+          DO 40 I = 1, N-1
+              E( I ) = RZERO
+   40     CONTINUE
+         GOTO 200
+      END IF
+*      
+*     Case KD=1: 
+*     The matrix is already Tridiagonal. We have to make diagonal 
+*     and offdiagonal elements real, and store them in D and E.
+*     For that, for real precision just copy the diag and offdiag 
+*     to D and E while for the COMPLEX case the bulge chasing is  
+*     performed to convert the hermetian tridiagonal to symmetric 
+*     tridiagonal. A simpler coversion formula might be used, but then 
+*     updating the Q matrix will be required and based if Q is generated
+*     or not this might complicate the story. 
+*      
+C      IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+      IF( KD.EQ.1 ) THEN
+          DO 50 I = 1, N
+              D( I ) = ( AB( ABDPOS, I ) )
+   50     CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+*         make off-diagonal elements real and copy them to E
+*
+          IF( UPPER ) THEN
+              DO 60 I = 1, N - 1
+                  TMP = AB( ABOFDPOS, I+1 )
+                  ABSTMP = ABS( TMP )
+                  AB( ABOFDPOS, I+1 ) = ABSTMP
+                  E( I ) = ABSTMP
+                  IF( ABSTMP.NE.RZERO ) THEN
+                     TMP = TMP / ABSTMP
+                  ELSE
+                     TMP = ONE
+                  END IF
+                  IF( I.LT.N-1 )
+     $               AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C                  IF( WANTZ ) THEN
+C                     CALL DSCAL( N, ( TMP ), Q( 1, I+1 ), 1 )
+C                  END IF
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N - 1
+                 TMP = AB( ABOFDPOS, I )
+                 ABSTMP = ABS( TMP )
+                 AB( ABOFDPOS, I ) = ABSTMP
+                 E( I ) = ABSTMP
+                 IF( ABSTMP.NE.RZERO ) THEN
+                    TMP = TMP / ABSTMP
+                 ELSE
+                    TMP = ONE
+                 END IF
+                 IF( I.LT.N-1 )
+     $              AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C                 IF( WANTQ ) THEN
+C                    CALL DSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C                 END IF
+   70         CONTINUE
+          ENDIF
+#else
+          IF( UPPER ) THEN
+              DO 60 I = 1, N-1
+                 E( I ) = ( AB( ABOFDPOS, I+1 ) )
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N-1
+                 E( I ) = ( AB( ABOFDPOS, I ) )
+   70         CONTINUE
+          ENDIF
+#endif
+          GOTO 200
+      END IF
+*
+*     Main code start here. 
+*     Reduce the symmetric band of A to a tridiagonal matrix.
+*
+      THGRSIZ   = N
+      GRSIZ     = 1
+      SHIFT     = 3
+      NBTILES   = CEILING( REAL(N)/REAL(KD) )
+      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*      
+      CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+      CALL DLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+*     openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+*     main bulge chasing loop
+*      
+      DO 100 THGRID = 1, THGRNB
+          STT  = (THGRID-1)*THGRSIZ+1
+          THED = MIN( (STT + THGRSIZ -1), (N-1))
+          DO 110 I = STT, N-1
+              ED = MIN( I, THED )
+              IF( STT.GT.ED ) GOTO 100
+              DO 120 M = 1, STEPERCOL
+                  ST = STT
+                  DO 130 SWEEPID = ST, ED
+                      DO 140 K = 1, GRSIZ
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+     $                           + (M-1)*GRSIZ + K
+                          IF ( MYID.EQ.1 ) THEN
+                              TTYPE = 1
+                          ELSE
+                              TTYPE = MOD( MYID, 2 ) + 2
+                          ENDIF
+
+                          IF( TTYPE.EQ.2 ) THEN
+                              COLPT      = (MYID/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              BLKLASTIND = COLPT
+                          ELSE
+                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              IF( ( STIND.GE.EDIND-1 ).AND.
+     $                            ( EDIND.EQ.N ) ) THEN
+                                  BLKLASTIND = N
+                              ELSE
+                                  BLKLASTIND = 0
+                              ENDIF
+                          ENDIF
+*
+*                         Call the kernel
+*                             
+#if defined(_OPENMP)
+                          IF( TTYPE.NE.1 ) THEN      
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(in:WORK(MYID-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ENDIF
+#else
+                          CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                         STIND, EDIND, SWEEPID, N, KD, IB,
+     $                         WORK ( INDA ), LDA, 
+     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                         WORK( INDW + TID*KD ) )
+#endif 
+                          IF ( BLKLASTIND.GE.(N-1) ) THEN
+                              STT = STT + 1
+                              GOTO 130
+                          ENDIF
+  140                 CONTINUE
+  130             CONTINUE
+  120         CONTINUE
+  110     CONTINUE
+  100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*      
+*     Copy the diagonal from A to D. Note that D is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      DO 150 I = 1, N
+          D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
+  150 CONTINUE
+*      
+*     Copy the off diagonal from A to E. Note that E is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      IF( UPPER ) THEN
+          DO 160 I = 1, N-1
+             E( I ) = ( WORK( OFDPOS+I*LDA ) )
+  160     CONTINUE
+      ELSE
+          DO 170 I = 1, N-1
+             E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
+  170     CONTINUE
+      ENDIF
+*
+  200 CONTINUE  
+*
+      HOUS( 1 ) = LHMIN
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of DSYTRD_SB2ST
+*
+      END
+#undef PRECISION_REAL
+      
diff --git a/SRC/dsytrd_sy2sb.f b/SRC/dsytrd_sy2sb.f
new file mode 100644
index 00000000..8f0261df
--- /dev/null
+++ b/SRC/dsytrd_sy2sb.f
@@ -0,0 +1,517 @@
+*> \brief \b DSYTRD_SY2SB
+*
+*  @generated from zhetrd_he2hb.f, fortran z -> d, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DSYTRD_SY2SB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+*                              WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), AB( LDAB, * ), 
+*                          TAU( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric
+*> band-diagonal form AB by a orthogonal similarity transformation:
+*> Q**T * A * Q = AB.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*>          The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          tridiagonal matrix T, and the elements above the first
+*>          superdiagonal, with the array TAU, represent the orthogonal
+*>          matrix Q as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and first subdiagonal of A are over-
+*>          written by the corresponding elements of the tridiagonal
+*>          matrix T, and the elements below the first subdiagonal, with
+*>          the array TAU, represent the orthogonal matrix Q as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*>          On exit, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors (see Further
+*>          Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension LWORK.
+*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK which should be calculated
+*           by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*>          where FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*>          putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup doubleSYcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*>  If UPLO = 'U', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**T
+*>
+*>  where tau is a real scalar, and v is a real vector with
+*>  v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*>  A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*>  If UPLO = 'L', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**T
+*>
+*>  where tau is a real scalar, and v is a real vector with
+*>  v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+*   A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*>  The contents of A on exit are illustrated by the following examples
+*>  with n = 5:
+*>
+*>  if UPLO = 'U':                       if UPLO = 'L':
+*>
+*>    (  ab  ab/v1  v1      v1     v1    )              (  ab                            )
+*>    (      ab     ab/v2   v2     v2    )              (  ab/v1  ab                     )
+*>    (             ab      ab/v3  v3    )              (  v1     ab/v2  ab              )
+*>    (                     ab     ab/v4 )              (  v1     v2     ab/v3  ab       )
+*>    (                            ab    )              (  v1     v2     v3     ab/v4 ab )
+*>
+*>  where d and e denote diagonal and off-diagonal elements of T, and vi
+*>  denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), AB( LDAB, * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   RONE
+      DOUBLE PRECISION   ZERO, ONE, HALF
+      PARAMETER          ( RONE = 1.0D+0,
+     $                   ZERO = 0.0D+0,
+     $                   ONE = 1.0D+0,
+     $                   HALF = 0.5D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER
+      INTEGER            I, J, IINFO, LWMIN, PN, PK, LK,
+     $                   LDT, LDW, LDS2, LDS1, 
+     $                   LS2, LS1, LW, LT,
+     $                   TPOS, WPOS, S2POS, S1POS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, DSYR2K, DSYMM, DGEMM,
+     $                   DLARFT, DGELQF, DGEQRF, DLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required 
+*     and test the input parameters
+*
+      INFO   = 0
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWMIN  = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
+      
+      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+         INFO = -7
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
+      END IF
+*
+*     Quick return if possible        
+*     Copy the upper/lower portion of A into AB 
+*
+      IF( N.LE.KD+1 ) THEN
+          IF( UPPER ) THEN
+              DO 100 I = 1, N
+                  LK = MIN( KD+1, I )
+                  CALL DCOPY( LK, A( I-LK+1, I ), 1, 
+     $                            AB( KD+1-LK+1, I ), 1 )
+  100         CONTINUE
+          ELSE
+              DO 110 I = 1, N
+                  LK = MIN( KD+1, N-I+1 )
+                  CALL DCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+  110         CONTINUE
+          ENDIF
+          WORK( 1 ) = 1
+          RETURN
+      END IF
+*
+*     Determine the pointer position for the workspace
+*      
+      LDT    = KD
+      LDS1   = KD
+      LT     = LDT*KD
+      LW     = N*KD
+      LS1    = LDS1*KD
+      LS2    = LWMIN - LT - LW - LS1
+*      LS2 = N*MAX(KD,FACTOPTNB) 
+      TPOS   = 1
+      WPOS   = TPOS  + LT
+      S1POS  = WPOS  + LW
+      S2POS  = S1POS + LS1 
+      IF( UPPER ) THEN
+          LDW    = KD
+          LDS2   = KD
+      ELSE
+          LDW    = N
+          LDS2   = N
+      ENDIF
+*
+*
+*     Set the workspace of the triangular matrix T to zero once such a
+*     way everytime T is generated the upper/lower portion will be always zero  
+*   
+      CALL DLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+      IF( UPPER ) THEN
+          DO 10 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the LQ factorization of the current block
+*        
+             CALL DGELQF( KD, PN, A( I, I+KD ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB
+*        
+             DO 20 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   20        CONTINUE
+*                
+             CALL DLASET( 'Lower', PK, PK, ZERO, ONE, 
+     $                    A( I, I+KD ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL DLARFT( 'Forward', 'Rowwise', PN, PK,
+     $                    A( I, I+KD ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL DGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+     $                   ONE,  WORK( TPOS ), LDT,
+     $                         A( I, I+KD ), LDA,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL DSYMM( 'Right', UPLO, PK, PN,
+     $                   ONE,  A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL DGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+     $                   ONE,  WORK( WPOS ), LDW,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL DGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+     $                   -HALF, WORK( S1POS ), LDS1, 
+     $                          A( I, I+KD ), LDA,
+     $                   ONE,   WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V'*W - W'*V
+*        
+             CALL DSYR2K( UPLO, 'Conjugate', PN, PK,
+     $                    -ONE, A( I, I+KD ), LDA,
+     $                          WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+   10     CONTINUE
+*
+*        Copy the upper band to AB which is the band storage matrix
+*
+         DO 30 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   30    CONTINUE
+*
+      ELSE
+*
+*         Reduce the lower triangle of A to lower band matrix
+*        
+          DO 40 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the QR factorization of the current block
+*        
+             CALL DGEQRF( PN, KD, A( I+KD, I ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB 
+*        
+             DO 50 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   50        CONTINUE
+*                
+             CALL DLASET( 'Upper', PK, PK, ZERO, ONE, 
+     $                    A( I+KD, I ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL DLARFT( 'Forward', 'Columnwise', PN, PK,
+     $                    A( I+KD, I ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   ONE, A( I+KD, I ), LDA,
+     $                         WORK( TPOS ), LDT,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL DSYMM( 'Left', UPLO, PN, PK,
+     $                   ONE, A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL DGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+     $                   ONE, WORK( S2POS ), LDS2,
+     $                         WORK( WPOS ), LDW,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   -HALF, A( I+KD, I ), LDA,
+     $                         WORK( S1POS ), LDS1,
+     $                   ONE, WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V*W' - W*V'
+*        
+             CALL DSYR2K( UPLO, 'No transpose', PN, PK,
+     $                    -ONE, A( I+KD, I ), LDA,
+     $                           WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+*            ==================================================================
+*            RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+*             DO 45 J = I, I+PK-1
+*                LK = MIN( KD, N-J ) + 1
+*                CALL DCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+*   45        CONTINUE
+*            ==================================================================
+   40     CONTINUE
+*
+*        Copy the lower band to AB which is the band storage matrix
+*
+         DO 60 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   60    CONTINUE
+
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of DSYTRD_SY2SB
+*
+      END
diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f
index 42a380cf..c66f1679 100644
--- a/SRC/ilaenv.f
+++ b/SRC/ilaenv.f
@@ -189,7 +189,8 @@
 *     .. Executable Statements ..
 *
       GO TO ( 10, 10, 10, 80, 90, 100, 110, 120,
-     $        130, 140, 150, 160, 160, 160, 160, 160 )ISPEC
+     $        130, 140, 150, 160, 160, 160, 160, 160,
+     $        170, 170, 170, 170, 170 )ISPEC
 *
 *     Invalid value for ISPEC
 *
@@ -690,6 +691,13 @@
       ILAENV = IPARMQ( ISPEC, NAME, OPTS, N1, N2, N3, N4 )
       RETURN
 *
+  170 CONTINUE
+*
+*     17 <= ISPEC <= 21: 2stage eigenvalues and SVD or related subroutines.
+*
+      ILAENV = IPARAM2STAGE( ISPEC, NAME, OPTS, N1, N2, N3, N4 )
+      RETURN
+*
 *     End of ILAENV
 *
       END
diff --git a/SRC/iparam2stage.F b/SRC/iparam2stage.F
new file mode 100644
index 00000000..6443f16e
--- /dev/null
+++ b/SRC/iparam2stage.F
@@ -0,0 +1,388 @@
+*> \brief \b IPARAM2STAGE
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download IPARAM2STAGE + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/iparam2stage.F"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/iparam2stage.F"> 
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/iparam2stage.F"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, 
+*                                    NI, NBI, IBI, NXI )
+*       #if defined(_OPENMP)
+*           use omp_lib
+*       #endif
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*( * )    NAME, OPTS
+*       INTEGER            ISPEC, NI, NBI, IBI, NXI
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>      This program sets problem and machine dependent parameters
+*>      useful for xHETRD_2STAGE, xHETRD_H@2HB, xHETRD_HB2ST,
+*>      xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD 
+*>      and related subroutines for eigenvalue problems. 
+*>      It is called whenever ILAENV is called with 17 <= ISPEC <= 21
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ISPEC
+*> \verbatim
+*>          ISPEC is integer scalar
+*>              ISPEC specifies which tunable parameter IPARAM2STAGE should
+*>              return.
+*>
+*>              ISPEC=17: the optimal blocksize nb for the reduction to
+*                         BAND
+*>
+*>              ISPEC=18: the optimal blocksize ib for the eigenvectors
+*>                        singular vectors update routine
+*>
+*>              ISPEC=19: The length of the array that store the Housholder 
+*>                        representation for the second stage 
+*>                        Band to Tridiagonal or Bidiagonal
+*>
+*>              ISPEC=20: The workspace needed for the routine in input.
+*>
+*>              ISPEC=21: For future release.
+*> \endverbatim
+*>
+*> \param[in] NAME
+*> \verbatim
+*>          NAME is character string
+*>               Name of the calling subroutine
+*> \endverbatim
+*>
+*> \param[in] OPTS
+*> \verbatim
+*>          OPTS is CHARACTER*(*)
+*>          The character options to the subroutine NAME, concatenated
+*>          into a single character string.  For example, UPLO = 'U',
+*>          TRANS = 'T', and DIAG = 'N' for a triangular routine would
+*>          be specified as OPTS = 'UTN'.
+*> \endverbatim
+*>
+*> \param[in] NI
+*> \verbatim
+*>          NI is INTEGER which is the size of the matrix
+*> \endverbatim
+*>
+*> \param[in] NBI
+*> \verbatim
+*>          NBI is INTEGER which is the used in the reduciton, 
+*           (e.g., the size of the band), needed to compute workspace
+*           and LHOUS2.
+*> \endverbatim
+*>
+*> \param[in] IBI
+*> \verbatim
+*>          IBI is INTEGER which represent the IB of the reduciton,
+*           needed to compute workspace and LHOUS2.
+*> \endverbatim
+*>
+*> \param[in] NXI
+*> \verbatim
+*>          NXI is INTEGER needed in the future release.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date June 2016
+*
+*> \ingroup auxOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All detail are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, 
+     $                              NI, NBI, IBI, NXI )
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER*( * )    NAME, OPTS
+      INTEGER            ISPEC, NI, NBI, IBI, NXI
+*
+*  ================================================================
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, IC, IZ, KD, IB, LHOUS, LWORK, NTHREADS,
+     $                   FACTOPTNB, QROPTNB, LQOPTNB
+      LOGICAL            RPREC, CPREC
+      CHARACTER          PREC*1, ALGO*3, STAG*5, SUBNAM*12, VECT*3
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CHAR, ICHAR, MAX
+*     ..
+*     .. External Functions ..
+      INTEGER            ILAENV
+      EXTERNAL           ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Invalid value for ISPEC
+*
+      IF( (ISPEC.LT.17).OR.(ISPEC.GT.21) ) THEN
+          IPARAM2STAGE = -1
+          RETURN
+      ENDIF
+*
+*     Get the number of threads
+*      
+      NTHREADS = 1
+#if defined(_OPENMP)
+!$OMP PARALLEL 
+      NTHREADS = OMP_GET_NUM_THREADS()
+!$OMP END PARALLEL
+#endif
+*      WRITE(*,*) 'IPARAM VOICI NTHREADS ISPEC ',NTHREADS, ISPEC
+      IF( ISPEC.EQ.19 ) GOTO 19
+*
+*     Convert NAME to upper case if the first character is lower case.
+*
+      IPARAM2STAGE = -1
+      SUBNAM = NAME
+      IC = ICHAR( SUBNAM( 1: 1 ) )
+      IZ = ICHAR( 'Z' )
+      IF( IZ.EQ.90 .OR. IZ.EQ.122 ) THEN
+*
+*        ASCII character set
+*
+         IF( IC.GE.97 .AND. IC.LE.122 ) THEN
+            SUBNAM( 1: 1 ) = CHAR( IC-32 )
+            DO 100 I = 2, 12
+               IC = ICHAR( SUBNAM( I: I ) )
+               IF( IC.GE.97 .AND. IC.LE.122 )
+     $            SUBNAM( I: I ) = CHAR( IC-32 )
+  100       CONTINUE
+         END IF
+*
+      ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN
+*
+*        EBCDIC character set
+*
+         IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR.
+     $       ( IC.GE.145 .AND. IC.LE.153 ) .OR.
+     $       ( IC.GE.162 .AND. IC.LE.169 ) ) THEN
+            SUBNAM( 1: 1 ) = CHAR( IC+64 )
+            DO 110 I = 2, 12
+               IC = ICHAR( SUBNAM( I: I ) )
+               IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR.
+     $             ( IC.GE.145 .AND. IC.LE.153 ) .OR.
+     $             ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I:
+     $             I ) = CHAR( IC+64 )
+  110       CONTINUE
+         END IF
+*
+      ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN
+*
+*        Prime machines:  ASCII+128
+*
+         IF( IC.GE.225 .AND. IC.LE.250 ) THEN
+            SUBNAM( 1: 1 ) = CHAR( IC-32 )
+            DO 120 I = 2, 12
+               IC = ICHAR( SUBNAM( I: I ) )
+               IF( IC.GE.225 .AND. IC.LE.250 )
+     $            SUBNAM( I: I ) = CHAR( IC-32 )
+  120       CONTINUE
+         END IF
+      END IF
+*
+      PREC  = SUBNAM( 1: 1 )
+      ALGO  = SUBNAM( 4: 6 )
+      STAG  = SUBNAM( 8:12 )
+      RPREC = PREC.EQ.'S' .OR. PREC.EQ.'D'
+      CPREC = PREC.EQ.'C' .OR. PREC.EQ.'Z'
+*
+*     Invalid value for PRECISION
+*      
+      IF( .NOT.( RPREC .OR. CPREC ) ) THEN
+          IPARAM2STAGE = -1
+          RETURN
+      ENDIF
+*      WRITE(*,*),'RPREC,CPREC ',RPREC,CPREC,
+*     $           '   ALGO ',ALGO,'    STAGE ',STAG
+*      
+      GO TO ( 17, 17, 19, 20, 21 ) ISPEC-16
+*
+   17 CONTINUE
+*
+*     ISPEC = 17, 18:  block size KD, IB
+*     Could be also dependent from N but for now it
+*     depend only on sequential or parallel
+*
+      IF( NTHREADS.GT.4 ) THEN
+          IF( CPREC ) THEN
+              KD = 128
+              IB = 32
+          ELSE
+              KD = 160
+              IB = 40
+          ENDIF
+      ELSE IF( NTHREADS.GT.1 ) THEN
+          IF( CPREC ) THEN
+              KD = 64
+              IB = 32
+          ELSE
+              KD = 64
+              IB = 32
+          ENDIF
+      ELSE
+          IF( CPREC ) THEN
+              KD = 16
+              IB = 16
+          ELSE
+              KD = 32
+              IB = 16
+          ENDIF
+      ENDIF
+      IF( ISPEC.EQ.17 ) IPARAM2STAGE = KD
+      IF( ISPEC.EQ.18 ) IPARAM2STAGE = IB
+      RETURN
+*
+   19 CONTINUE
+*
+*     ISPEC = 19:  
+*     LHOUS length of the Houselholder representation
+*     matrix (V,T) of the second stage. should be >= 1.
+*
+*     Will add the VECT OPTION HERE next release
+      VECT  = OPTS(1:1)
+      IF( VECT.EQ.'N' ) THEN
+          LHOUS = MAX( 1, 4*NI )
+      ELSE
+*         This is not correct, it need to call the ALGO and the stage2
+          LHOUS = MAX( 1, 4*NI ) + IBI
+      ENDIF
+      IF( LHOUS.GE.0 ) THEN
+          IPARAM2STAGE = LHOUS
+      ELSE
+          IPARAM2STAGE = -1
+      ENDIF
+      RETURN
+*
+   20 CONTINUE
+*
+*     ISPEC = 20: (21 for future use)  
+*     LWORK length of the workspace for 
+*     either or both stages for TRD and BRD. should be >= 1.
+*     TRD:
+*     TRD_stage 1: = LT + LW + LS1 + LS2
+*                  = LDT*KD + N*KD + N*MAX(KD,FACTOPTNB) + LDS2*KD 
+*                    where LDT=LDS2=KD
+*                  = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*     TRD_stage 2: = (2NB+1)*N + KD*NTHREADS
+*     TRD_both   : = max(stage1,stage2) + AB ( AB=(KD+1)*N )
+*                  = N*KD + N*max(KD+1,FACTOPTNB) 
+*                    + max(2*KD*KD, KD*NTHREADS) 
+*                    + (KD+1)*N
+      LWORK        = -1
+      SUBNAM(1:1)  = PREC
+      SUBNAM(2:6)  = 'GEQRF'
+      QROPTNB      = ILAENV( 1, SUBNAM, ' ', NI, NBI, -1, -1 )
+      SUBNAM(2:6)  = 'GELQF'
+      LQOPTNB      = ILAENV( 1, SUBNAM, ' ', NBI, NI, -1, -1 )
+*     Could be QR or LQ for TRD and the max for BRD
+      FACTOPTNB    = MAX(QROPTNB, LQOPTNB)
+      IF( ALGO.EQ.'TRD' ) THEN
+          IF( STAG.EQ.'2STAG' ) THEN
+              LWORK = NI*NBI + NI*MAX(NBI+1,FACTOPTNB) 
+     $              + MAX(2*NBI*NBI, NBI*NTHREADS) 
+     $              + (NBI+1)*NI
+          ELSE IF( (STAG.EQ.'HE2HB').OR.(STAG.EQ.'SY2SB') ) THEN
+              LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI
+          ELSE IF( (STAG.EQ.'HB2ST').OR.(STAG.EQ.'SB2ST') ) THEN
+              LWORK = (2*NBI+1)*NI + NBI*NTHREADS
+          ENDIF
+      ELSE IF( ALGO.EQ.'BRD' ) THEN
+          IF( STAG.EQ.'2STAG' ) THEN
+              LWORK = 2*NI*NBI + NI*MAX(NBI+1,FACTOPTNB) 
+     $              + MAX(2*NBI*NBI, NBI*NTHREADS) 
+     $              + (NBI+1)*NI
+          ELSE IF( STAG.EQ.'GE2GB' ) THEN
+              LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI
+          ELSE IF( STAG.EQ.'GB2BD' ) THEN
+              LWORK = (3*NBI+1)*NI + NBI*NTHREADS
+          ENDIF
+      ENDIF
+      LWORK = MAX ( 1, LWORK )
+
+      IF( LWORK.GT.0 ) THEN
+          IPARAM2STAGE = LWORK
+      ELSE
+          IPARAM2STAGE = -1
+      ENDIF
+      RETURN
+*
+   21 CONTINUE
+*
+*     ISPEC = 21 for future use 
+      IPARAM2STAGE = NXI
+      RETURN
+*
+*     ==== End of IPARAM2STAGE ====
+*
+      END
diff --git a/SRC/slarfy.f b/SRC/slarfy.f
new file mode 100644
index 00000000..19a7fa6d
--- /dev/null
+++ b/SRC/slarfy.f
@@ -0,0 +1,161 @@
+*> \brief \b SLARFY
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INCV, LDC, N
+*       REAL               TAU
+*       ..
+*       .. Array Arguments ..
+*       REAL               C( LDC, * ), V( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n symmetric matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*>    H = I - tau * v * v'
+*>
+*> where  tau  is a scalar and  v  is a vector.
+*>
+*> If  tau  is  zero, then  H  is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          Specifies whether the upper or lower triangular part of the
+*>          symmetric matrix C is stored.
+*>          = 'U':  Upper triangle
+*>          = 'L':  Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of rows and columns of the matrix C.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                  (1 + (N-1)*abs(INCV))
+*>          The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between successive elements of v.  INCV must
+*>          not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is REAL
+*>          The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (LDC, N)
+*>          On entry, the matrix C.
+*>          On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C.  LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (N)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup single_eig
+*
+*  =====================================================================
+      SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK test routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INCV, LDC, N
+      REAL               TAU
+*     ..
+*     .. Array Arguments ..
+      REAL               C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO, HALF
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0, HALF = 0.5E+0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               ALPHA
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SAXPY, SSYMV, SSYR2
+*     ..
+*     .. External Functions ..
+      REAL               SDOT
+      EXTERNAL           SDOT
+*     ..
+*     .. Executable Statements ..
+*
+      IF( TAU.EQ.ZERO )
+     $   RETURN
+*
+*     Form  w:= C * v
+*
+      CALL SSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+      ALPHA = -HALF*TAU*SDOT( N, WORK, 1, V, INCV )
+      CALL SAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+*     C := C - v * w' - w * v'
+*
+      CALL SSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+      RETURN
+*
+*     End of SLARFY
+*
+      END
diff --git a/SRC/ssb2st_kernels.f b/SRC/ssb2st_kernels.f
new file mode 100644
index 00000000..60058dda
--- /dev/null
+++ b/SRC/ssb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b SSB2ST_KERNELS
+*
+*  @generated from zhb2st_kernels.f, fortran z -> s, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SSB2ST_KERNELS + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*                                   ST, ED, SWEEP, N, NB, IB,
+*                                   A, LDA, V, TAU, LDVT, WORK)
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       LOGICAL            WANTZ
+*       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), V( * ), 
+*                          TAU( * ), WORK( * )
+*  
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSB2ST_KERNELS is an internal routine used by the SSYTRD_SB2ST
+*> subroutine.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> @param[in] n
+*>          The order of the matrix A.
+*>
+*> @param[in] nb
+*>          The size of the band.
+*>
+*> @param[in, out] A
+*>          A pointer to the matrix A.
+*>
+*> @param[in] lda
+*>          The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*>          REAL array, dimension 2*n if eigenvalues only are
+*>          requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*>          REAL array, dimension (2*n).
+*>          The scalar factors of the Householder reflectors are stored
+*>          in this array.
+*>
+*> @param[in] st
+*>          internal parameter for indices.
+*>
+*> @param[in] ed
+*>          internal parameter for indices.
+*>
+*> @param[in] sweep
+*>          internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*>          internal parameter for indices.
+*>
+*> @param[in] wantz
+*>          logical which indicate if Eigenvalue are requested or both
+*>          Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*>          Workspace of size nb.
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+     $                            ST, ED, SWEEP, N, NB, IB,
+     $                            A, LDA, V, TAU, LDVT, WORK)
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      LOGICAL            WANTZ
+      INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), V( * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0,
+     $                   ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
+     $                   DPOS, OFDPOS, AJETER 
+      REAL               CTMP 
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLARFG, SLARFX, SLARFY
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MOD
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     ..
+*     .. Executable Statements ..
+*      
+      AJETER = IB + LDVT
+      UPPER = LSAME( UPLO, 'U' )
+
+      IF( UPPER ) THEN
+          DPOS    = 2 * NB + 1
+          OFDPOS  = 2 * NB
+      ELSE
+          DPOS    = 1
+          OFDPOS  = 2
+      ENDIF
+
+*
+*     Upper case
+*  
+      IF( UPPER ) THEN
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 101, 102, 103 ) TTYPE
+*
+  101     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 10 I = 1, LM-1
+              V( VPOS+I )         = ( A( OFDPOS-I, ST+I ) )
+              A( OFDPOS-I, ST+I ) = ZERO  
+   10     CONTINUE
+          CTMP = ( A( OFDPOS, ST ) )
+          CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+          A( OFDPOS, ST ) = CTMP
+* 
+  103     CONTINUE
+          LM = ED - ST + 1
+          CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+     $                             A( DPOS, ST ), LDA-1, WORK)
+          GOTO 300
+*
+  102     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+          IF( LM.GT.0) THEN
+              CALL SLARFX( 'Left', LN, LM, V( VPOS ),
+     $                     ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*
+              V( VPOS ) = ONE
+              DO 30 I = 1, LM-1
+                  V( VPOS+I )          = ( A( DPOS-NB-I, J1+I ) )
+                  A( DPOS-NB-I, J1+I ) = ZERO
+   30         CONTINUE
+              CTMP = ( A( DPOS-NB, J1 ) )
+              CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+              A( DPOS-NB, J1 ) = CTMP
+*             
+              CALL SLARFX( 'Right', LN-1, LM, V( VPOS ), 
+     $                     TAU( TAUPOS ),
+     $                     A( DPOS-NB+1, J1 ), LDA-1, WORK)
+          ENDIF
+          GOTO 300
+*
+*     Lower case
+*  
+      ELSE
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 201, 202, 203 ) TTYPE
+*  
+  201     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 20 I = 1, LM-1
+              V( VPOS+I )         = A( OFDPOS+I, ST-1 )
+              A( OFDPOS+I, ST-1 ) = ZERO  
+   20     CONTINUE
+          CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+*
+  203     CONTINUE
+          LM = ED - ST + 1
+*
+          CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ),
+     $                                      A( DPOS, ST ), LDA-1, WORK)
+
+          GOTO 300
+*
+  202     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+*
+          IF( LM.GT.0) THEN
+              CALL SLARFX( 'Right', LM, LN, V( VPOS ), 
+     $                     TAU( TAUPOS ), A( DPOS+NB, ST ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*              
+              V( VPOS ) = ONE
+              DO 40 I = 1, LM-1
+                  V( VPOS+I )        = A( DPOS+NB+I, ST )
+                  A( DPOS+NB+I, ST ) = ZERO
+   40         CONTINUE
+              CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+     $                                    TAU( TAUPOS ) )
+*                  
+              CALL SLARFX( 'Left', LM, LN-1, V( VPOS ), 
+     $                     ( TAU( TAUPOS ) ),
+     $                     A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+          ENDIF
+          GOTO 300
+      ENDIF
+
+  300 CONTINUE    
+      RETURN
+*
+*     END OF SSB2ST_KERNELS
+*
+      END      
diff --git a/SRC/ssbev_2stage.f b/SRC/ssbev_2stage.f
new file mode 100644
index 00000000..821c00a3
--- /dev/null
+++ b/SRC/ssbev_2stage.f
@@ -0,0 +1,377 @@
+*> \brief <b> SSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from dsbev_2stage.f, fortran d -> s, Sat Nov  5 23:58:09 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*       ..
+*       .. Array Arguments ..
+*       REAL               AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is REAL array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*     ..
+*     .. Array Arguments ..
+      REAL               AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, WANTZ, LQUERY
+      INTEGER            IINFO, IMAX, INDE, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSB
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA
+     $                   SSYTRD_SB2ST 
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -11
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSBEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            W( 1 ) = AB( 1, 1 )
+         ELSE
+            W( 1 ) = AB( KD+1, 1 )
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+         CALL SSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ),
+     $                INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of SSBEV_2STAGE
+*
+      END
diff --git a/SRC/ssbevd_2stage.f b/SRC/ssbevd_2stage.f
new file mode 100644
index 00000000..8a306306
--- /dev/null
+++ b/SRC/ssbevd_2stage.f
@@ -0,0 +1,412 @@
+*> \brief <b> SSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from dsbevd_2stage.f, fortran d -> s, Sat Nov  5 23:58:03 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                 WORK, LWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       REAL               AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses
+*> a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is REAL array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK and IWORK
+*>          arrays, returns these values as the first entries of the WORK
+*>          and IWORK arrays, and no error message related to LWORK or
+*>          LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If JOBZ  = 'N' or N <= 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 2, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK and IWORK arrays, and no error message related to
+*>          LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                          WORK, LWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   LLWRK2
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSB
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC,
+     $                   SSTERF, XERBLA, SSYTRD_SB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( N.LE.1 ) THEN
+         LIWMIN = 1
+         LWMIN = 1
+      ELSE
+         IB    = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+         IF( WANTZ ) THEN
+            LIWMIN = 3 + 5*N
+            LWMIN = 1 + 5*N + 2*N**2
+         ELSE
+            LIWMIN = 1
+            LWMIN = MAX( 2*N, N+LHTRD+LWTRD )
+         END IF
+      END IF
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -13
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSBEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = AB( 1, 1 )
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call SSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEDC.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+         CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+         CALL SGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N,
+     $               ZERO, WORK( INDWK2 ), N )
+         CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 )
+     $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
+*
+      WORK( 1 ) = LWMIN
+      IWORK( 1 ) = LIWMIN
+      RETURN
+*
+*     End of SSBEVD_2STAGE
+*
+      END
diff --git a/SRC/ssbevx_2stage.f b/SRC/ssbevx_2stage.f
new file mode 100644
index 00000000..d3a588c4
--- /dev/null
+++ b/SRC/ssbevx_2stage.f
@@ -0,0 +1,633 @@
+*> \brief <b> SSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @generated from dsbevx_2stage.f, fortran d -> s, Sat Nov  5 23:58:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+*                                 LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+*                                 LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       REAL               AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+*      $                   Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric band matrix A using the 2stage technique for
+*> the reduction to tridiagonal. Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found;
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found;
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is REAL array, dimension (LDQ, N)
+*>          If JOBZ = 'V', the N-by-N orthogonal matrix used in the
+*>                         reduction to tridiagonal form.
+*>          If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*>          LDQ is INTEGER
+*>          The leading dimension of the array Q.  If JOBZ = 'V', then
+*>          LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing AB to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*SLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is REAL array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 7*N, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS + 2*N
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realOTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q,
+     $                          LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z,
+     $                          LDZ, WORK, LWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      REAL               AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ),
+     $                   Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+     $                   LQUERY
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   NSPLIT
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSB
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SGEMV, SLACPY, SLASCL, SSCAL,
+     $                   SSTEBZ, SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA,
+     $                   SSYTRD_SB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -7
+      ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+         INFO = -9
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -11
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -12
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -13
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+     $      INFO = -18
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = 2*N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -20
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSBEVX_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         M = 1
+         IF( LOWER ) THEN
+            TMP1 = AB( 1, 1 )
+         ELSE
+            TMP1 = AB( KD+1, 1 )
+         END IF
+         IF( VALEIG ) THEN
+            IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+     $         M = 0
+         END IF
+         IF( M.EQ.1 ) THEN
+            W( 1 ) = TMP1
+            IF( WANTZ )
+     $         Z( 1, 1 ) = ONE
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      ELSE
+         VLL = ZERO
+         VUU = ZERO
+      END IF
+      ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form.
+*
+      INDD    = 1
+      INDE    = INDD + N
+      INDHOUS = INDE + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal
+*     to zero, then call SSTERF or SSTEQR.  If this fails for some
+*     eigenvalue, then try SSTEBZ.
+*
+      TEST = .FALSE.
+      IF (INDEIG) THEN
+         IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+         CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+         INDEE = INDWRK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL SLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+     $                   WORK( INDWRK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 10 I = 1, N
+                  IFAIL( I ) = 0
+   10          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWO = INDISP + N
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by SSTEIN.
+*
+         DO 20 J = 1, M
+            CALL SCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+            CALL SGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO,
+     $                  Z( 1, J ), 1 )
+   20    CONTINUE
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of SSBEVX_2STAGE
+*
+      END
diff --git a/SRC/ssyev_2stage.f b/SRC/ssyev_2stage.f
new file mode 100644
index 00000000..52f11c35
--- /dev/null
+++ b/SRC/ssyev_2stage.f
@@ -0,0 +1,348 @@
+*> \brief <b> SSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @generated from dsyev_2stage.f, fortran d -> s, Sat Nov  5 23:55:51 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*                                INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+     $                         INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSY
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF,
+     $                   XERBLA, SSYTRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = 2*N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -8
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = A( 1, 1 )
+         WORK( 1 ) = 2
+         IF( WANTZ )
+     $      A( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = INDE + N
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, first call
+*     SORGTR to generate the orthogonal matrix, then call SSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+*        Not available in this release, and agrument checking should not
+*        let it getting here
+         RETURN
+         CALL SORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+     $                LLWORK, IINFO )
+         CALL SSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ),
+     $                INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of SSYEV_2STAGE
+*
+      END
diff --git a/SRC/ssyevd_2stage.f b/SRC/ssyevd_2stage.f
new file mode 100644
index 00000000..8510b645
--- /dev/null
+++ b/SRC/ssyevd_2stage.f
@@ -0,0 +1,406 @@
+*> \brief <b> SSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @generated from dsyevd_2stage.f, fortran d -> s, Sat Nov  5 23:55:54 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                                IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LIWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       REAL               A( LDA, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal. If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array,
+*>                                         dimension (LWORK)
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N+1
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be at least
+*>                                                1 + 6*N + 2*N**2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK and IWORK
+*>          arrays, returns these values as the first entries of the WORK
+*>          and IWORK arrays, and no error message related to LWORK or
+*>          LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If N <= 1,                LIWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK and IWORK arrays, and no error message related to
+*>          LWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
+*>                to converge; i off-diagonal elements of an intermediate
+*>                tridiagonal form did not converge to zero;
+*>                if INFO = i and JOBZ = 'V', then the algorithm failed
+*>                to compute an eigenvalue while working on the submatrix
+*>                lying in rows and columns INFO/(N+1) through
+*>                mod(INFO,N+1).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date September 2012
+*
+*> \ingroup realSYeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA \n
+*>  Modified by Francoise Tisseur, University of Tennessee \n
+*>  Modified description of INFO. Sven, 16 Feb 05. \n
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                          IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.2) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     September 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LIWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      REAL               A( LDA, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+*
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE,
+     $                   LIWMIN, LLWORK, LLWRK2, LWMIN,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSY
+      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF,
+     $                   SSYTRD_2STAGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LIWMIN = 1
+            LWMIN = 1
+         ELSE
+            KD    = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            IF( WANTZ ) THEN
+               LIWMIN = 3 + 5*N
+               LWMIN = 1 + 6*N + 2*N**2
+            ELSE
+               LIWMIN = 1
+               LWMIN = 2*N + 1 + LHTRD + LWTRD
+            END IF
+         END IF
+         WORK( 1 )  = LWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -8
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -10
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = A( 1, 1 )
+         IF( WANTZ )
+     $      A( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = INDE + N
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call SSTERF.  For eigenvectors, first call
+*     SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+*     tridiagonal matrix, then call SORMTR to multiply it by the
+*     Householder transformations stored in A.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL SSTERF( N, W, WORK( INDE ), INFO )
+      ELSE
+*        Not available in this release, and agrument checking should not
+*        let it getting here
+         RETURN
+         CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO )
+         CALL SORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+     $                WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+         CALL SLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 )
+     $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
+*
+      WORK( 1 )  = LWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of SSYEVD_2STAGE
+*
+      END
diff --git a/SRC/ssyevr_2stage.f b/SRC/ssyevr_2stage.f
new file mode 100644
index 00000000..27b99303
--- /dev/null
+++ b/SRC/ssyevr_2stage.f
@@ -0,0 +1,745 @@
+*> \brief <b> SSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @generated from dsyevr_2stage.f, fortran d -> s, Sat Nov  5 23:50:10 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                          IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+*                          LWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            ISUPPZ( * ), IWORK( * )
+*       REAL               A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> SSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to SSYTRD.  Then, whenever possible, SSYEVR_2STAGE calls SSTEMR to compute
+*> the eigenspectrum using Relatively Robust Representations.  SSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*>    (a) Compute T - sigma I  = L D L^T, so that L and D
+*>        define all the wanted eigenvalues to high relative accuracy.
+*>        This means that small relative changes in the entries of D and L
+*>        cause only small relative changes in the eigenvalues and
+*>        eigenvectors. The standard (unfactored) representation of the
+*>        tridiagonal matrix T does not have this property in general.
+*>    (b) Compute the eigenvalues to suitable accuracy.
+*>        If the eigenvectors are desired, the algorithm attains full
+*>        accuracy of the computed eigenvalues only right before
+*>        the corresponding vectors have to be computed, see steps c) and d).
+*>    (c) For each cluster of close eigenvalues, select a new
+*>        shift close to the cluster, find a new factorization, and refine
+*>        the shifted eigenvalues to suitable accuracy.
+*>    (d) For each eigenvalue with a large enough relative separation compute
+*>        the corresponding eigenvector by forming a rank revealing twisted
+*>        factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see SSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*>   to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*>   Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*>   Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*>   2004.  Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*>   tridiagonal eigenvalue/eigenvector problem",
+*>   Computer Science Division Technical Report No. UCB/CSD-97-971,
+*>   UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : SSYEVR_2STAGE calls SSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> SSYEVR_2STAGE calls SSTEBZ and SSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of SSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*>          For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and
+*>          SSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*>
+*>          If high relative accuracy is important, set ABSTOL to
+*>          SLAMCH( 'Safe minimum' ).  Doing so will guarantee that
+*>          eigenvalues are computed to high relative accuracy when
+*>          possible in future releases.  The current code does not
+*>          make any guarantees about high relative accuracy, but
+*>          future releases will. See J. Barlow and J. Demmel,
+*>          "Computing Accurate Eigensystems of Scaled Diagonally
+*>          Dominant Matrices", LAPACK Working Note #7, for a discussion
+*>          of which matrices define their eigenvalues to high relative
+*>          accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is REAL array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*>          Supplying N columns is always safe.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*>          ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*>          The support of the eigenvectors in Z, i.e., the indices
+*>          indicating the nonzero elements in Z. The i-th eigenvector
+*>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*>          ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal
+*>          matrix). The support of the eigenvectors of A is typically 
+*>          1:N because of the orthogonal transformations applied by SORMTR.
+*>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 26*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 5*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal size of the IWORK array,
+*>          returns this value as the first entry of the IWORK array, and
+*>          no error message related to LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  Internal error
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Inderjit Dhillon, IBM Almaden, USA \n
+*>     Osni Marques, LBNL/NERSC, USA \n
+*>     Ken Stanley, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>     Jason Riedy, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                   IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK,
+     $                   LWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            ISUPPZ( * ), IWORK( * )
+      REAL               A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ,
+     $                   TRYRAC, TEST
+      CHARACTER          ORDER
+      INTEGER            I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE,
+     $                   INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU,
+     $                   INDWK, INDWKN, ISCALE, J, JJ, LIWMIN,
+     $                   LLWORK, LLWRKN, LWMIN, NSPLIT,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSY
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
+     $                   SSTERF, SSWAP, SSYTRD_2STAGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IEEEOK = ILAENV( 10, 'SSYEVR', 'N', 1, 2, 3, 4 )
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+*
+      LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
+*
+      KD     = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+      LHTRD  = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWTRD  = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+      LIWMIN = MAX( 1, 10*N )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -18
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+*         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
+*         NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
+*         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
+         WORK( 1 ) = LWMIN
+         IWORK( 1 ) = LIWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYEVR_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         WORK( 1 ) = 26
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = A( 1, 1 )
+         ELSE
+            IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+               M = 1
+               W( 1 ) = A( 1, 1 )
+            END IF
+         END IF
+         IF( WANTZ ) THEN
+            Z( 1, 1 ) = ONE
+            ISUPPZ( 1 ) = 1
+            ISUPPZ( 2 ) = 1
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF (VALEIG) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL SSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+
+*     Initialize indices into workspaces.  Note: The IWORK indices are
+*     used only if SSTERF or SSTEMR fail.
+
+*     WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the
+*     elementary reflectors used in SSYTRD.
+      INDTAU = 1
+*     WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries.
+      INDD = INDTAU + N
+*     WORK(INDE:INDE+N-1) stores the off-diagonal entries of the
+*     tridiagonal matrix from SSYTRD.
+      INDE = INDD + N
+*     WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over
+*     -written by SSTEMR (the SSTERF path copies the diagonal to W).
+      INDDD = INDE + N
+*     WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over
+*     -written while computing the eigenvalues in SSTERF and SSTEMR.
+      INDEE = INDDD + N
+*     INDHOUS is the starting offset Householder storage of stage 2
+      INDHOUS = INDEE + N
+*     INDWK is the starting offset of the left-over workspace, and
+*     LLWORK is the remaining workspace size.
+      INDWK  = INDHOUS + LHTRD
+      LLWORK = LWORK - INDWK + 1
+
+
+*     IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and
+*     stores the block indices of each of the M<=N eigenvalues.
+      INDIBL = 1
+*     IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and
+*     stores the starting and finishing indices of each block.
+      INDISP = INDIBL + N
+*     IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+*     that corresponding to eigenvectors that fail to converge in
+*     SSTEIN.  This information is discarded; if any fail, the driver
+*     returns INFO > 0.
+      INDIFL = INDISP + N
+*     INDIWO is the offset of the remaining integer workspace.
+      INDIWO = INDIFL + N
+
+*
+*     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+*
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+     $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired
+*     then call SSTERF or SSTEMR and SORMTR.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 )
+*
+            IF (ABSTOL .LE. TWO*N*EPS) THEN
+               TRYRAC = .TRUE.
+            ELSE
+               TRYRAC = .FALSE.
+            END IF
+            CALL SSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ),
+     $                   VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ,
+     $                   TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK,
+     $                   INFO )
+*
+*
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by SSTEMR.
+*
+            IF( WANTZ .AND. INFO.EQ.0 ) THEN
+               INDWKN = INDE
+               LLWRKN = LWORK - INDWKN + 1
+               CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA,
+     $                      WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+     $                      LLWRKN, IINFO )
+            END IF
+         END IF
+*
+*
+         IF( INFO.EQ.0 ) THEN
+*           Everything worked.  Skip SSTEBZ/SSTEIN.  IWORK(:) are
+*           undefined.
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+*     Also call SSTEBZ and SSTEIN if SSTEMR fails.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+     $                INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by SSTEIN.
+*
+         INDWKN = INDE
+         LLWRKN = LWORK - INDWKN + 1
+         CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+*  Jump here if SSTEMR/SSTEIN succeeded.
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.  Note: We do not sort the IFAIL portion of IWORK.
+*     It may not be initialized (if SSTEMR/SSTEIN succeeded), and we do
+*     not return this detailed information to the user.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               W( I ) = W( J )
+               W( J ) = TMP1
+               CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of SSYEVR_2STAGE
+*
+      END
diff --git a/SRC/ssyevx_2stage.f b/SRC/ssyevx_2stage.f
new file mode 100644
index 00000000..96a73ecd
--- /dev/null
+++ b/SRC/ssyevx_2stage.f
@@ -0,0 +1,608 @@
+*> \brief <b> SSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b>
+*
+*  @generated from dsyevx_2stage.f, fortran d -> s, Sat Nov  5 23:55:46 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+*                                 LWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+*       REAL               ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       REAL               A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a real symmetric matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can be
+*> selected by specifying either a range of values or a range of indices
+*> for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is REAL
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is REAL
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*SLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*SLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          On normal exit, the first M elements contain the selected
+*>          eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is REAL array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 8*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 3*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 3*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+     $                          LWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+      REAL               ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      REAL               A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE,
+     $                   ITMP1, J, JJ, LLWORK, LLWRKN,
+     $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      REAL               SLAMCH, SLANSY
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
+     $                   SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA,
+     $                   SSYTRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            KD    = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD )
+            WORK( 1 )  = LWMIN
+         END IF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -17
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = A( 1, 1 )
+         ELSE
+            IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN
+               M = 1
+               W( 1 ) = A( 1, 1 )
+            END IF
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = SLAMCH( 'Safe minimum' )
+      EPS    = SLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL SSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
+*
+      INDTAU  = 1
+      INDE    = INDTAU + N
+      INDD    = INDE + N
+      INDHOUS = INDD + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+     $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal to
+*     zero, then call SSTERF or SORGTR and SSTEQR.  If this fails for
+*     some eigenvalue, then try SSTEBZ.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+         CALL SCOPY( N, WORK( INDD ), 1, W, 1 )
+         INDEE = INDWRK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SSTERF( N, W, WORK( INDEE ), INFO )
+         ELSE
+            CALL SLACPY( 'A', N, N, A, LDA, Z, LDZ )
+            CALL SORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+     $                   WORK( INDWRK ), LLWORK, IINFO )
+            CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 )
+            CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ,
+     $                   WORK( INDWRK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 30 I = 1, N
+                  IFAIL( I ) = 0
+   30          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 40
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWO = INDISP + N
+      CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             WORK( INDD ), WORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO )
+*
+*        Apply orthogonal matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by SSTEIN.
+*
+         INDWKN = INDE
+         LLWRKN = LWORK - INDWKN + 1
+         CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   40 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 60 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 50 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   50       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   60    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of SSYEVX_2STAGE
+*
+      END
diff --git a/SRC/ssygv_2stage.f b/SRC/ssygv_2stage.f
new file mode 100644
index 00000000..6eb172e9
--- /dev/null
+++ b/SRC/ssygv_2stage.f
@@ -0,0 +1,371 @@
+*> \brief \b SSYGV_2STAGE
+*
+*  @generated from dsygv_2stage.f, fortran d -> s, Sun Nov  6 12:54:29 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSYGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssygv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+*                                WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a real generalized symmetric-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be symmetric and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+*  sizes N>2000.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*>          ITYPE is INTEGER
+*>          Specifies the problem type to be solved:
+*>          = 1:  A*x = (lambda)*B*x
+*>          = 2:  A*B*x = (lambda)*x
+*>          = 3:  B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangles of A and B are stored;
+*>          = 'L':  Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA, N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          matrix Z of eigenvectors.  The eigenvectors are normalized
+*>          as follows:
+*>          if ITYPE = 1 or 2, Z**T*B*Z = I;
+*>          if ITYPE = 3, Z**T*inv(B)*Z = I.
+*>          If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*>          or the lower triangle (if UPLO='L') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB, N)
+*>          On entry, the symmetric positive definite matrix B.
+*>          If UPLO = 'U', the leading N-by-N upper triangular part of B
+*>          contains the upper triangular part of the matrix B.
+*>          If UPLO = 'L', the leading N-by-N lower triangular part of B
+*>          contains the lower triangular part of the matrix B.
+*>
+*>          On exit, if INFO <= N, the part of B containing the matrix is
+*>          overwritten by the triangular factor U or L from the Cholesky
+*>          factorization B = U**T*U or B = L*L**T.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is REAL array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + 2*N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  SPOTRF or SSYEV returned an error code:
+*>             <= N:  if INFO = i, SSYEV failed to converge;
+*>                    i off-diagonal elements of an intermediate
+*>                    tridiagonal form did not converge to zero;
+*>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
+*>                    minor of order i of B is not positive definite.
+*>                    The factorization of B could not be completed and
+*>                    no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup realSYeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * ), W( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTZ
+      CHARACTER          TRANS
+      INTEGER            NEIG,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SPOTRF, SSYGST, STRMM, STRSM, XERBLA,
+     $                   SSYEV_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      UPPER = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = 2*N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYGV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     Form a Cholesky factorization of B.
+*
+      CALL SPOTRF( UPLO, N, B, LDB, INFO )
+      IF( INFO.NE.0 ) THEN
+         INFO = N + INFO
+         RETURN
+      END IF
+*
+*     Transform problem to standard eigenvalue problem and solve.
+*
+      CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+      CALL SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
+*
+      IF( WANTZ ) THEN
+*
+*        Backtransform eigenvectors to the original problem.
+*
+         NEIG = N
+         IF( INFO.GT.0 )
+     $      NEIG = INFO - 1
+         IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+*           For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+*           backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'N'
+            ELSE
+               TRANS = 'T'
+            END IF
+*
+            CALL STRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+*
+         ELSE IF( ITYPE.EQ.3 ) THEN
+*
+*           For B*A*x=(lambda)*x;
+*           backtransform eigenvectors: x = L*y or U**T*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'T'
+            ELSE
+               TRANS = 'N'
+            END IF
+*
+            CALL STRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+         END IF
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of SSYGV_2STAGE
+*
+      END
diff --git a/SRC/ssytrd_2stage.f b/SRC/ssytrd_2stage.f
new file mode 100644
index 00000000..fba3dd45
--- /dev/null
+++ b/SRC/ssytrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b SSYTRD_2STAGE
+*
+*  @generated from zhetrd_2stage.f, fortran z -> s, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SSYTRD_2STAGE + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       CHARACTER          VECT, UPLO
+*       INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*      ..
+*      .. Array Arguments ..
+*       REAL               D( * ), E( * )
+*       REAL               A( LDA, * ), TAU( * ),
+*                          HOUS2( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q1**T Q2**T* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  in particular for the second stage (Band to
+*>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>                  then LHOUS2 is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the band superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          internal band-diagonal matrix AB, and the elements above 
+*>          the KD superdiagonal, with the array TAU, represent the orthogonal
+*>          matrix Q1 as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and band subdiagonal of A are over-
+*>          written by the corresponding elements of the internal band-diagonal
+*>          matrix AB, and the elements below the KD subdiagonal, with
+*>          the array TAU, represent the orthogonal matrix Q1 as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is REAL array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors of 
+*>          the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*>          HOUS2 is REAL array, dimension LHOUS2, that
+*>          store the Householder representation of the stage2
+*>          band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*>          LHOUS2 is INTEGER
+*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS2 array, returns
+*>          this value as the first entry of the HOUS2 array, and no error
+*>          message related to LHOUS2 is issued by XERBLA.
+*>          LHOUS2 = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = max(stage1,stage2) + (KD+1)*N
+*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                        + max(2*KD*KD, KD*NTHREADS) 
+*>                        + (KD+1)*N 
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup realSYcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          VECT, UPLO
+      INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*     ..
+*     .. Array Arguments ..
+      REAL               D( * ), E( * )
+      REAL               A( LDA, * ), TAU( * ),
+     $                   HOUS2( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTQ
+      INTEGER            KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, SSYTRD_SY2SB, SSYTRD_SB2ST
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO   = 0
+      WANTQ  = LSAME( VECT, 'V' )
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      KD     = ILAENV( 17, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+*      WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+*     $            LHMIN, LWMIN
+*
+      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -12
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS2( 1 ) = LHMIN
+         WORK( 1 )  = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYTRD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDAB  = KD+1
+      LWRK  = LWORK-LDAB*N
+      ABPOS = 1
+      WPOS  = ABPOS + LDAB*N
+      CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+     $                   TAU, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
+         RETURN
+      END IF
+      CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
+     $                   WORK( ABPOS ), LDAB, D, E, 
+     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
+         RETURN
+      END IF
+*
+*
+      HOUS2( 1 ) = LHMIN
+      WORK( 1 )  = LWMIN
+      RETURN
+*
+*     End of SSYTRD_2STAGE
+*
+      END
diff --git a/SRC/ssytrd_sb2st.F b/SRC/ssytrd_sb2st.F
new file mode 100644
index 00000000..edbcf125
--- /dev/null
+++ b/SRC/ssytrd_sb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b SSBTRD
+*
+*  @generated from zhetrd_hb2st.F, fortran z -> s, Sun Nov  6 19:34:06 2016
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SSBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+*       #define PRECISION_REAL
+*
+*       #if defined(_OPENMP)
+*       use omp_lib
+*       #endif
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          STAGE1, UPLO, VECT
+*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
+*       ..
+*       .. Array Arguments ..
+*       REAL               D( * ), E( * )
+*       REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSBTRD reduces a real symmetric band matrix A to real symmetric
+*> tridiagonal form T by a orthogonal similarity transformation:
+*> Q**T * A * Q = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*>          STAGE is CHARACTER*1
+*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>                  from dense to band using the ssytrd_sy2sb routine
+*>                  was not called before this routine to reproduce AB. 
+*>                  In other term this routine is called as standalone. 
+*>          = 'Y':  "Yes": to mention that the stage 1 of the 
+*>                  reduction from dense to band using the ssytrd_sy2sb 
+*>                  routine has been called to produce AB (e.g., AB is
+*>                  the output of ssytrd_sy2sb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  and thus LHOUS is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate or to apply Q later on, 
+*>                  then LHOUS is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB,N)
+*>          On entry, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>          On exit, the diagonal elements of AB are overwritten by the
+*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*>          elements on the first superdiagonal (if UPLO = 'U') or the
+*>          first subdiagonal (if UPLO = 'L') are overwritten by the
+*>          off-diagonal elements of T; the rest of AB is overwritten by
+*>          values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is REAL array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T:
+*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*>          HOUS is REAL array, dimension LHOUS, that
+*>          store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*>          LHOUS is INTEGER
+*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS array, returns
+*>          this value as the first entry of the HOUS array, and no error
+*>          message related to LHOUS is issued by XERBLA.
+*>          LHOUS = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'     
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = (2KD+1)*N + KD*NTHREADS
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup real16OTHERcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_REAL
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          STAGE1, UPLO, VECT
+      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
+*     ..
+*     .. Array Arguments ..
+      REAL               D( * ), E( * )
+      REAL               AB( LDAB, * ), HOUS( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               RZERO
+      REAL               ZERO, ONE
+      PARAMETER          ( RZERO = 0.0E+0,
+     $                   ZERO = 0.0E+0,
+     $                   ONE  = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+     $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+      REAL               ABSTMP
+      REAL               TMP
+#endif
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSB2ST_KERNELS, SLACPY, SLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX, CEILING, REAL
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required.
+*     Test the input parameters
+*
+      DEBUG   = 0
+      INFO    = 0
+      AFTERS1 = LSAME( STAGE1, 'Y' )
+      WANTQ   = LSAME( VECT, 'V' )
+      UPPER   = LSAME( UPLO, 'U' )
+      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      IB     = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+*
+      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.(KD+1) ) THEN
+         INFO = -7
+      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -11
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -13
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS( 1 ) = LHMIN
+         WORK( 1 ) = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDV      = KD + IB
+      SIZETAU  = 2 * N
+      SISEV    = 2 * N
+      INDTAU   = 1
+      INDV     = INDTAU + SIZETAU
+      LDA      = 2 * KD + 1
+      SIZEA    = LDA * N
+      INDA     = 1
+      INDW     = INDA + SIZEA
+      NTHREADS = 1
+      TID      = 0
+*
+      IF( UPPER ) THEN
+          APOS     = INDA + KD
+          AWPOS    = INDA
+          DPOS     = APOS + KD
+          OFDPOS   = DPOS - 1
+          ABDPOS   = KD + 1
+          ABOFDPOS = KD
+      ELSE
+          APOS     = INDA 
+          AWPOS    = INDA + KD + 1
+          DPOS     = APOS
+          OFDPOS   = DPOS + 1
+          ABDPOS   = 1
+          ABOFDPOS = 2
+
+      ENDIF
+*      
+*     Case KD=0: 
+*     The matrix is diagonal. We just copy it (convert to "real" for 
+*     real because D is double and the imaginary part should be 0) 
+*     and store it in D. A sequential code here is better or 
+*     in a parallel environment it might need two cores for D and E
+*
+      IF( KD.EQ.0 ) THEN
+          DO 30 I = 1, N
+              D( I ) = ( AB( ABDPOS, I ) )
+   30     CONTINUE
+          DO 40 I = 1, N-1
+              E( I ) = RZERO
+   40     CONTINUE
+         GOTO 200
+      END IF
+*      
+*     Case KD=1: 
+*     The matrix is already Tridiagonal. We have to make diagonal 
+*     and offdiagonal elements real, and store them in D and E.
+*     For that, for real precision just copy the diag and offdiag 
+*     to D and E while for the COMPLEX case the bulge chasing is  
+*     performed to convert the hermetian tridiagonal to symmetric 
+*     tridiagonal. A simpler coversion formula might be used, but then 
+*     updating the Q matrix will be required and based if Q is generated
+*     or not this might complicate the story. 
+*      
+C      IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+      IF( KD.EQ.1 ) THEN
+          DO 50 I = 1, N
+              D( I ) = ( AB( ABDPOS, I ) )
+   50     CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+*         make off-diagonal elements real and copy them to E
+*
+          IF( UPPER ) THEN
+              DO 60 I = 1, N - 1
+                  TMP = AB( ABOFDPOS, I+1 )
+                  ABSTMP = ABS( TMP )
+                  AB( ABOFDPOS, I+1 ) = ABSTMP
+                  E( I ) = ABSTMP
+                  IF( ABSTMP.NE.RZERO ) THEN
+                     TMP = TMP / ABSTMP
+                  ELSE
+                     TMP = ONE
+                  END IF
+                  IF( I.LT.N-1 )
+     $               AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C                  IF( WANTZ ) THEN
+C                     CALL SSCAL( N, ( TMP ), Q( 1, I+1 ), 1 )
+C                  END IF
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N - 1
+                 TMP = AB( ABOFDPOS, I )
+                 ABSTMP = ABS( TMP )
+                 AB( ABOFDPOS, I ) = ABSTMP
+                 E( I ) = ABSTMP
+                 IF( ABSTMP.NE.RZERO ) THEN
+                    TMP = TMP / ABSTMP
+                 ELSE
+                    TMP = ONE
+                 END IF
+                 IF( I.LT.N-1 )
+     $              AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C                 IF( WANTQ ) THEN
+C                    CALL SSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C                 END IF
+   70         CONTINUE
+          ENDIF
+#else
+          IF( UPPER ) THEN
+              DO 60 I = 1, N-1
+                 E( I ) = ( AB( ABOFDPOS, I+1 ) )
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N-1
+                 E( I ) = ( AB( ABOFDPOS, I ) )
+   70         CONTINUE
+          ENDIF
+#endif
+          GOTO 200
+      END IF
+*
+*     Main code start here. 
+*     Reduce the symmetric band of A to a tridiagonal matrix.
+*
+      THGRSIZ   = N
+      GRSIZ     = 1
+      SHIFT     = 3
+      NBTILES   = CEILING( REAL(N)/REAL(KD) )
+      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*      
+      CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+      CALL SLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+*     openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+*     main bulge chasing loop
+*      
+      DO 100 THGRID = 1, THGRNB
+          STT  = (THGRID-1)*THGRSIZ+1
+          THED = MIN( (STT + THGRSIZ -1), (N-1))
+          DO 110 I = STT, N-1
+              ED = MIN( I, THED )
+              IF( STT.GT.ED ) GOTO 100
+              DO 120 M = 1, STEPERCOL
+                  ST = STT
+                  DO 130 SWEEPID = ST, ED
+                      DO 140 K = 1, GRSIZ
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+     $                           + (M-1)*GRSIZ + K
+                          IF ( MYID.EQ.1 ) THEN
+                              TTYPE = 1
+                          ELSE
+                              TTYPE = MOD( MYID, 2 ) + 2
+                          ENDIF
+
+                          IF( TTYPE.EQ.2 ) THEN
+                              COLPT      = (MYID/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              BLKLASTIND = COLPT
+                          ELSE
+                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              IF( ( STIND.GE.EDIND-1 ).AND.
+     $                            ( EDIND.EQ.N ) ) THEN
+                                  BLKLASTIND = N
+                              ELSE
+                                  BLKLASTIND = 0
+                              ENDIF
+                          ENDIF
+*
+*                         Call the kernel
+*                             
+#if defined(_OPENMP)
+                          IF( TTYPE.NE.1 ) THEN      
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(in:WORK(MYID-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ENDIF
+#else
+                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                         STIND, EDIND, SWEEPID, N, KD, IB,
+     $                         WORK ( INDA ), LDA, 
+     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                         WORK( INDW + TID*KD ) )
+#endif 
+                          IF ( BLKLASTIND.GE.(N-1) ) THEN
+                              STT = STT + 1
+                              GOTO 130
+                          ENDIF
+  140                 CONTINUE
+  130             CONTINUE
+  120         CONTINUE
+  110     CONTINUE
+  100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*      
+*     Copy the diagonal from A to D. Note that D is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      DO 150 I = 1, N
+          D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
+  150 CONTINUE
+*      
+*     Copy the off diagonal from A to E. Note that E is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      IF( UPPER ) THEN
+          DO 160 I = 1, N-1
+             E( I ) = ( WORK( OFDPOS+I*LDA ) )
+  160     CONTINUE
+      ELSE
+          DO 170 I = 1, N-1
+             E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) )
+  170     CONTINUE
+      ENDIF
+*
+  200 CONTINUE  
+*
+      HOUS( 1 ) = LHMIN
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of SSYTRD_SB2ST
+*
+      END
+#undef PRECISION_REAL
+      
diff --git a/SRC/ssytrd_sy2sb.f b/SRC/ssytrd_sy2sb.f
new file mode 100644
index 00000000..3dbbaf1f
--- /dev/null
+++ b/SRC/ssytrd_sy2sb.f
@@ -0,0 +1,517 @@
+*> \brief \b SSYTRD_SY2SB
+*
+*  @generated from zhetrd_he2hb.f, fortran z -> s, Sun Nov  6 19:34:06 2016
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SSYTRD_SY2SB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+*                              WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), AB( LDAB, * ), 
+*                          TAU( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric
+*> band-diagonal form AB by a orthogonal similarity transformation:
+*> Q**T * A * Q = AB.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*>          The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          tridiagonal matrix T, and the elements above the first
+*>          superdiagonal, with the array TAU, represent the orthogonal
+*>          matrix Q as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and first subdiagonal of A are over-
+*>          written by the corresponding elements of the tridiagonal
+*>          matrix T, and the elements below the first subdiagonal, with
+*>          the array TAU, represent the orthogonal matrix Q as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB,N)
+*>          On exit, the upper or lower triangle of the symmetric band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors (see Further
+*>          Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension LWORK.
+*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK which should be calculated
+*           by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*>          where FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*>          putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup realSYcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*>  If UPLO = 'U', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**T
+*>
+*>  where tau is a real scalar, and v is a real vector with
+*>  v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*>  A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*>  If UPLO = 'L', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**T
+*>
+*>  where tau is a real scalar, and v is a real vector with
+*>  v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+*   A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*>  The contents of A on exit are illustrated by the following examples
+*>  with n = 5:
+*>
+*>  if UPLO = 'U':                       if UPLO = 'L':
+*>
+*>    (  ab  ab/v1  v1      v1     v1    )              (  ab                            )
+*>    (      ab     ab/v2   v2     v2    )              (  ab/v1  ab                     )
+*>    (             ab      ab/v3  v3    )              (  v1     ab/v2  ab              )
+*>    (                     ab     ab/v4 )              (  v1     v2     ab/v3  ab       )
+*>    (                            ab    )              (  v1     v2     v3     ab/v4 ab )
+*>
+*>  where d and e denote diagonal and off-diagonal elements of T, and vi
+*>  denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), AB( LDAB, * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               RONE
+      REAL               ZERO, ONE, HALF
+      PARAMETER          ( RONE = 1.0E+0,
+     $                   ZERO = 0.0E+0,
+     $                   ONE = 1.0E+0,
+     $                   HALF = 0.5E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER
+      INTEGER            I, J, IINFO, LWMIN, PN, PK, LK,
+     $                   LDT, LDW, LDS2, LDS1, 
+     $                   LS2, LS1, LW, LT,
+     $                   TPOS, WPOS, S2POS, S1POS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, SSYR2K, SSYMM, SGEMM,
+     $                   SLARFT, SGELQF, SGEQRF, SLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required 
+*     and test the input parameters
+*
+      INFO   = 0
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWMIN  = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
+      
+      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+         INFO = -7
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
+      END IF
+*
+*     Quick return if possible        
+*     Copy the upper/lower portion of A into AB 
+*
+      IF( N.LE.KD+1 ) THEN
+          IF( UPPER ) THEN
+              DO 100 I = 1, N
+                  LK = MIN( KD+1, I )
+                  CALL SCOPY( LK, A( I-LK+1, I ), 1, 
+     $                            AB( KD+1-LK+1, I ), 1 )
+  100         CONTINUE
+          ELSE
+              DO 110 I = 1, N
+                  LK = MIN( KD+1, N-I+1 )
+                  CALL SCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+  110         CONTINUE
+          ENDIF
+          WORK( 1 ) = 1
+          RETURN
+      END IF
+*
+*     Determine the pointer position for the workspace
+*      
+      LDT    = KD
+      LDS1   = KD
+      LT     = LDT*KD
+      LW     = N*KD
+      LS1    = LDS1*KD
+      LS2    = LWMIN - LT - LW - LS1
+*      LS2 = N*MAX(KD,FACTOPTNB) 
+      TPOS   = 1
+      WPOS   = TPOS  + LT
+      S1POS  = WPOS  + LW
+      S2POS  = S1POS + LS1 
+      IF( UPPER ) THEN
+          LDW    = KD
+          LDS2   = KD
+      ELSE
+          LDW    = N
+          LDS2   = N
+      ENDIF
+*
+*
+*     Set the workspace of the triangular matrix T to zero once such a
+*     way everytime T is generated the upper/lower portion will be always zero  
+*   
+      CALL SLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+      IF( UPPER ) THEN
+          DO 10 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the LQ factorization of the current block
+*        
+             CALL SGELQF( KD, PN, A( I, I+KD ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB
+*        
+             DO 20 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   20        CONTINUE
+*                
+             CALL SLASET( 'Lower', PK, PK, ZERO, ONE, 
+     $                    A( I, I+KD ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL SLARFT( 'Forward', 'Rowwise', PN, PK,
+     $                    A( I, I+KD ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL SGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+     $                   ONE,  WORK( TPOS ), LDT,
+     $                         A( I, I+KD ), LDA,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL SSYMM( 'Right', UPLO, PK, PN,
+     $                   ONE,  A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL SGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+     $                   ONE,  WORK( WPOS ), LDW,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL SGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+     $                   -HALF, WORK( S1POS ), LDS1, 
+     $                          A( I, I+KD ), LDA,
+     $                   ONE,   WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V'*W - W'*V
+*        
+             CALL SSYR2K( UPLO, 'Conjugate', PN, PK,
+     $                    -ONE, A( I, I+KD ), LDA,
+     $                          WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+   10     CONTINUE
+*
+*        Copy the upper band to AB which is the band storage matrix
+*
+         DO 30 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   30    CONTINUE
+*
+      ELSE
+*
+*         Reduce the lower triangle of A to lower band matrix
+*        
+          DO 40 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the QR factorization of the current block
+*        
+             CALL SGEQRF( PN, KD, A( I+KD, I ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB 
+*        
+             DO 50 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   50        CONTINUE
+*                
+             CALL SLASET( 'Upper', PK, PK, ZERO, ONE, 
+     $                    A( I+KD, I ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL SLARFT( 'Forward', 'Columnwise', PN, PK,
+     $                    A( I+KD, I ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   ONE, A( I+KD, I ), LDA,
+     $                         WORK( TPOS ), LDT,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL SSYMM( 'Left', UPLO, PN, PK,
+     $                   ONE, A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL SGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+     $                   ONE, WORK( S2POS ), LDS2,
+     $                         WORK( WPOS ), LDW,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   -HALF, A( I+KD, I ), LDA,
+     $                         WORK( S1POS ), LDS1,
+     $                   ONE, WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V*W' - W*V'
+*        
+             CALL SSYR2K( UPLO, 'No transpose', PN, PK,
+     $                    -ONE, A( I+KD, I ), LDA,
+     $                           WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+*            ==================================================================
+*            RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+*             DO 45 J = I, I+PK-1
+*                LK = MIN( KD, N-J ) + 1
+*                CALL SCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+*   45        CONTINUE
+*            ==================================================================
+   40     CONTINUE
+*
+*        Copy the lower band to AB which is the band storage matrix
+*
+         DO 60 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   60    CONTINUE
+
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of SSYTRD_SY2SB
+*
+      END
diff --git a/SRC/zhb2st_kernels.f b/SRC/zhb2st_kernels.f
new file mode 100644
index 00000000..ab03b303
--- /dev/null
+++ b/SRC/zhb2st_kernels.f
@@ -0,0 +1,320 @@
+*> \brief \b ZHB2ST_KERNELS
+*
+*  @precisions fortran z -> s d c
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZHB2ST_KERNELS + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*                                   ST, ED, SWEEP, N, NB, IB,
+*                                   A, LDA, V, TAU, LDVT, WORK)
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       LOGICAL            WANTZ
+*       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * ), V( * ), 
+*                          TAU( * ), WORK( * )
+*  
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHB2ST_KERNELS is an internal routine used by the ZHETRD_HB2ST
+*> subroutine.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> @param[in] n
+*>          The order of the matrix A.
+*>
+*> @param[in] nb
+*>          The size of the band.
+*>
+*> @param[in, out] A
+*>          A pointer to the matrix A.
+*>
+*> @param[in] lda
+*>          The leading dimension of the matrix A.
+*>
+*> @param[out] V
+*>          COMPLEX*16 array, dimension 2*n if eigenvalues only are
+*>          requested or to be queried for vectors.
+*>
+*> @param[out] TAU
+*>          COMPLEX*16 array, dimension (2*n).
+*>          The scalar factors of the Householder reflectors are stored
+*>          in this array.
+*>
+*> @param[in] st
+*>          internal parameter for indices.
+*>
+*> @param[in] ed
+*>          internal parameter for indices.
+*>
+*> @param[in] sweep
+*>          internal parameter for indices.
+*>
+*> @param[in] Vblksiz
+*>          internal parameter for indices.
+*>
+*> @param[in] wantz
+*>          logical which indicate if Eigenvalue are requested or both
+*>          Eigenvalue/Eigenvectors.
+*>
+*> @param[in] work
+*>          Workspace of size nb.
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+     $                            ST, ED, SWEEP, N, NB, IB,
+     $                            A, LDA, V, TAU, LDVT, WORK)
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      LOGICAL            WANTZ
+      INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), V( * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   ONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
+     $                   DPOS, OFDPOS, AJETER 
+      COMPLEX*16         CTMP 
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLARFG, ZLARFX, ZLARFY
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MOD
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     ..
+*     .. Executable Statements ..
+*      
+      AJETER = IB + LDVT
+      UPPER = LSAME( UPLO, 'U' )
+
+      IF( UPPER ) THEN
+          DPOS    = 2 * NB + 1
+          OFDPOS  = 2 * NB
+      ELSE
+          DPOS    = 1
+          OFDPOS  = 2
+      ENDIF
+
+*
+*     Upper case
+*  
+      IF( UPPER ) THEN
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 101, 102, 103 ) TTYPE
+*
+  101     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 10 I = 1, LM-1
+              V( VPOS+I )         = DCONJG( A( OFDPOS-I, ST+I ) )
+              A( OFDPOS-I, ST+I ) = ZERO  
+   10     CONTINUE
+          CTMP = DCONJG( A( OFDPOS, ST ) )
+          CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+          A( OFDPOS, ST ) = CTMP
+* 
+  103     CONTINUE
+          LM = ED - ST + 1
+          CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ),
+     $                             A( DPOS, ST ), LDA-1, WORK)
+          GOTO 300
+*
+  102     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+          IF( LM.GT.0) THEN
+              CALL ZLARFX( 'Left', LN, LM, V( VPOS ),
+     $                     DCONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*
+              V( VPOS ) = ONE
+              DO 30 I = 1, LM-1
+                  V( VPOS+I )          = DCONJG( A( DPOS-NB-I, J1+I ) )
+                  A( DPOS-NB-I, J1+I ) = ZERO
+   30         CONTINUE
+              CTMP = DCONJG( A( DPOS-NB, J1 ) )
+              CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
+              A( DPOS-NB, J1 ) = CTMP
+*             
+              CALL ZLARFX( 'Right', LN-1, LM, V( VPOS ), 
+     $                     TAU( TAUPOS ),
+     $                     A( DPOS-NB+1, J1 ), LDA-1, WORK)
+          ENDIF
+          GOTO 300
+*
+*     Lower case
+*  
+      ELSE
+*      
+          IF( WANTZ ) THEN
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ELSE
+              VPOS   = MOD( SWEEP-1, 2 ) * N + ST
+              TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
+          ENDIF
+          GO TO ( 201, 202, 203 ) TTYPE
+*  
+  201     CONTINUE
+          LM = ED - ST + 1
+*
+          V( VPOS ) = ONE
+          DO 20 I = 1, LM-1
+              V( VPOS+I )         = A( OFDPOS+I, ST-1 )
+              A( OFDPOS+I, ST-1 ) = ZERO  
+   20     CONTINUE
+          CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+     $                                   TAU( TAUPOS ) )
+*
+  203     CONTINUE
+          LM = ED - ST + 1
+*
+          CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ),
+     $                                      A( DPOS, ST ), LDA-1, WORK)
+
+          GOTO 300
+*
+  202     CONTINUE
+          J1 = ED+1
+          J2 = MIN( ED+NB, N )
+          LN = ED-ST+1
+          LM = J2-J1+1
+*
+          IF( LM.GT.0) THEN
+              CALL ZLARFX( 'Right', LM, LN, V( VPOS ), 
+     $                     TAU( TAUPOS ), A( DPOS+NB, ST ),
+     $                     LDA-1, WORK)
+*
+              IF( WANTZ ) THEN
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ELSE
+                  VPOS   = MOD( SWEEP-1, 2 ) * N + J1
+                  TAUPOS = MOD( SWEEP-1, 2 ) * N + J1
+              ENDIF
+*              
+              V( VPOS ) = ONE
+              DO 40 I = 1, LM-1
+                  V( VPOS+I )        = A( DPOS+NB+I, ST )
+                  A( DPOS+NB+I, ST ) = ZERO
+   40         CONTINUE
+              CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+     $                                    TAU( TAUPOS ) )
+*                  
+              CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ), 
+     $                     DCONJG( TAU( TAUPOS ) ),
+     $                     A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
+
+          ENDIF
+          GOTO 300
+      ENDIF
+
+  300 CONTINUE    
+      RETURN
+*
+*     END OF ZHB2ST_KERNELS
+*
+      END      
diff --git a/SRC/zhbev_2stage.f b/SRC/zhbev_2stage.f
new file mode 100644
index 00000000..f1088b87
--- /dev/null
+++ b/SRC/zhbev_2stage.f
@@ -0,0 +1,386 @@
+*> \brief <b> ZHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                WORK, LWORK, RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX*16 array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension LWORK
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (max(1,3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                         WORK, LWORK, RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, N, LWORK
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, WANTZ, LQUERY
+      INTEGER            IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHB
+      EXTERNAL           LSAME, DLAMCH, ZLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR
+     $                   ZHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -11
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHBEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            W( 1 ) = DBLE( AB( 1, 1 ) )
+         ELSE
+            W( 1 ) = DBLE( AB( KD+1, 1 ) )
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = ONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDHOUS = 1
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, call ZSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         INDRWK = INDE + N
+         CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ,
+     $                RWORK( INDRWK ), INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of ZHBEV_2STAGE
+*
+      END
diff --git a/SRC/zhbevd_2stage.f b/SRC/zhbevd_2stage.f
new file mode 100644
index 00000000..e4daae74
--- /dev/null
+++ b/SRC/zhbevd_2stage.f
@@ -0,0 +1,458 @@
+*> \brief <b> ZHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+*                                 WORK, LWORK, RWORK, LRWORK, IWORK, 
+*                                 LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of
+*> a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  If eigenvectors are desired, it
+*> uses a divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.  If UPLO = 'U', the first
+*>          superdiagonal and the diagonal of the tridiagonal matrix T
+*>          are returned in rows KD and KD+1 of AB, and if UPLO = 'L',
+*>          the diagonal and first subdiagonal of T are returned in the
+*>          first two rows of AB.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX*16 array, dimension (LDZ, N)
+*>          If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal
+*>          eigenvectors of the matrix A, with the i-th column of Z
+*>          holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array,
+*>                                         dimension (LRWORK)
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of array RWORK.
+*>          If N <= 1,               LRWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+*>          If JOBZ = 'V' and N > 1, LRWORK must be at least
+*>                        1 + 5*N + 2*N**2.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of array IWORK.
+*>          If JOBZ = 'N' or N <= 1, LIWORK must be at least 1.
+*>          If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N .
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ,
+     $                          WORK, LWORK, RWORK, LRWORK, IWORK, 
+     $                          LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         AB( LDAB, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D0, 0.0D0 ),
+     $                   CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE,
+     $                   LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS,
+     $                   LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHB
+      EXTERNAL           LSAME, DLAMCH, ZLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL, DSTERF, XERBLA, ZGEMM, ZLACPY,
+     $                   ZLASCL, ZSTEDC, ZHETRD_HB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( N.LE.1 ) THEN
+         LWMIN = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         IB    = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+         IF( WANTZ ) THEN
+            LWMIN = 2*N**2
+            LRWMIN = 1 + 5*N + 2*N**2
+            LIWMIN = 3 + 5*N
+         ELSE
+            LWMIN  = MAX( N, LHTRD + LWTRD )
+            LRWMIN = N
+            LIWMIN = 1
+         END IF
+      END IF
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -6
+      ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+         INFO = -9
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -13
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHBEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = DBLE( AB( 1, 1 ) )
+         IF( WANTZ )
+     $      Z( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+      END IF
+*
+*     Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDRWK  = INDE + N
+      LLRWK   = LRWORK - INDRWK + 1
+      INDHOUS = 1
+      INDWK   = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWK + 1
+      INDWK2  = INDWK + N*N
+      LLWK2   = LWORK - INDWK2 + 1
+*
+      CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, call ZSTEDC.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ),
+     $                LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK,
+     $                INFO )
+         CALL ZGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO,
+     $               WORK( INDWK2 ), N )
+         CALL ZLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+      RETURN
+*
+*     End of ZHBEVD_2STAGE
+*
+      END
diff --git a/SRC/zhbevx_2stage.f b/SRC/zhbevx_2stage.f
new file mode 100644
index 00000000..3efdcc74
--- /dev/null
+++ b/SRC/zhbevx_2stage.f
@@ -0,0 +1,646 @@
+*> \brief <b> ZHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+*                                 Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+*                                 IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+*      $                   Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian band matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors
+*> can be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found;
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found;
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB, N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>
+*>          On exit, AB is overwritten by values generated during the
+*>          reduction to tridiagonal form.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD + 1.
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is COMPLEX*16 array, dimension (LDQ, N)
+*>          If JOBZ = 'V', the N-by-N unitary matrix used in the
+*>                          reduction to tridiagonal form.
+*>          If JOBZ = 'N', the array Q is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDQ
+*> \verbatim
+*>          LDQ is INTEGER
+*>          The leading dimension of the array Q.  If JOBZ = 'V', then
+*>          LDQ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing AB to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*DLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = (2KD+1)*N + KD*NTHREADS
+*>                                   where KD is the size of the band.
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16OTHEReigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
+     $                          Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
+     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+     $                          IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         AB( LDAB, * ), Q( LDQ, * ), WORK( * ),
+     $                   Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D0, 0.0D0 ),
+     $                   CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ,
+     $                   LQUERY
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS,
+     $                   J, JJ, NSPLIT
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+      COMPLEX*16         CTMP1
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHB
+      EXTERNAL           LSAME, DLAMCH, ZLANHB, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZCOPY,
+     $                   ZGEMV, ZLACPY, ZLASCL, ZSTEIN, ZSTEQR,
+     $                   ZSWAP, ZHETRD_HB2ST
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.KD+1 ) THEN
+         INFO = -7
+      ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN
+         INFO = -9
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -11
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -12
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -13
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) )
+     $      INFO = -18
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            IB    = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 )
+            LWMIN = LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         ENDIF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -20
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHBEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         M = 1
+         IF( LOWER ) THEN
+            CTMP1 = AB( 1, 1 )
+         ELSE
+            CTMP1 = AB( KD+1, 1 )
+         END IF
+         TMP1 = DBLE( CTMP1 )
+         IF( VALEIG ) THEN
+            IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) )
+     $         M = 0
+         END IF
+         IF( M.EQ.1 ) THEN
+            W( 1 ) = DBLE( CTMP1 )
+            IF( WANTZ )
+     $         Z( 1, 1 ) = CONE
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      ELSE
+         VLL = ZERO
+         VUU = ZERO
+      END IF
+      ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         ELSE
+            CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO )
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form.
+*
+      INDD = 1
+      INDE = INDD + N
+      INDRWK = INDE + N
+*
+      INDHOUS = 1
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL ZHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB,
+     $                    RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ),
+     $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal
+*     to zero, then call DSTERF or ZSTEQR.  If this fails for some
+*     eigenvalue, then try DSTEBZ.
+*
+      TEST = .FALSE.
+      IF (INDEIG) THEN
+         IF (IL.EQ.1 .AND. IU.EQ.N) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN
+         CALL DCOPY( N, RWORK( INDD ), 1, W, 1 )
+         INDEE = INDRWK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL DSTERF( N, W, RWORK( INDEE ), INFO )
+         ELSE
+            CALL ZLACPY( 'A', N, N, Q, LDQ, Z, LDZ )
+            CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+     $                   RWORK( INDRWK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 10 I = 1, N
+                  IFAIL( I ) = 0
+   10          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWK = INDISP + N
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWK ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by ZSTEIN.
+*
+         DO 20 J = 1, M
+            CALL ZCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 )
+            CALL ZGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO,
+     $                  Z( 1, J ), 1 )
+   20    CONTINUE
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of ZHBEVX_2STAGE
+*
+      END
diff --git a/SRC/zheev_2stage.f b/SRC/zheev_2stage.f
new file mode 100644
index 00000000..5aca4da2
--- /dev/null
+++ b/SRC/zheev_2stage.f
@@ -0,0 +1,355 @@
+*> \brief <b> ZHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheev_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                                RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         A( LDA, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, the algorithm failed to converge; i
+*>                off-diagonal elements of an intermediate tridiagonal
+*>                form did not converge to zero.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                         RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         A( LDA, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16         CONE
+      PARAMETER          ( CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE,
+     $                   LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHE
+      EXTERNAL           LSAME, ILAENV, DLAMCH, ZLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR,
+     $                   ZUNGTR, ZHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -8
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHEEV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = DBLE( A( 1, 1 ) )
+         WORK( 1 ) = 1
+         IF( WANTZ )
+     $      A( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, first call
+*     ZUNGTR to generate the unitary matrix, then call ZSTEQR.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL ZUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
+     $                LLWORK, IINFO )
+         INDWRK = INDE + N
+         CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA,
+     $                RWORK( INDWRK ), INFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     Set WORK(1) to optimal complex workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of ZHEEV_2STAGE
+*
+      END
diff --git a/SRC/zheevd_2stage.f b/SRC/zheevd_2stage.f
new file mode 100644
index 00000000..79a0e886
--- /dev/null
+++ b/SRC/zheevd_2stage.f
@@ -0,0 +1,451 @@
+*> \brief <b> ZHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevd_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+*                          RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, LDA, LIWORK, LRWORK, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IWORK( * )
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         A( LDA, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
+*> complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
+*> divide and conquer algorithm.
+*>
+*> The divide and conquer algorithm makes very mild assumptions about
+*> floating point arithmetic. It will work on machines with a guard
+*> digit in add/subtract, or on those binary machines without guard
+*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+*> without guard digits, but we know of none.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          orthonormal eigenvectors of the matrix A.
+*>          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+*>          or the upper triangle (if UPLO='U') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N+1
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N+1
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array,
+*>                                         dimension (LRWORK)
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The dimension of the array RWORK.
+*>          If N <= 1,                LRWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LRWORK must be at least N.
+*>          If JOBZ  = 'V' and N > 1, LRWORK must be at least
+*>                         1 + 5*N + 2*N**2.
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          If N <= 1,                LIWORK must be at least 1.
+*>          If JOBZ  = 'N' and N > 1, LIWORK must be at least 1.
+*>          If JOBZ  = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
+*>                to converge; i off-diagonal elements of an intermediate
+*>                tridiagonal form did not converge to zero;
+*>                if INFO = i and JOBZ = 'V', then the algorithm failed
+*>                to compute an eigenvalue while working on the submatrix
+*>                lying in rows and columns INFO/(N+1) through
+*>                mod(INFO,N+1).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*>  Modified description of INFO. Sven, 16 Feb 05.
+*
+*> \par Contributors:
+*  ==================
+*>
+*> Jeff Rutter, Computer Science Division, University of California
+*> at Berkeley, USA
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
+     $                   RWORK, LRWORK, IWORK, LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, LDA, LIWORK, LRWORK, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         A( LDA, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16         CONE
+      PARAMETER          ( CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LOWER, LQUERY, WANTZ
+      INTEGER            IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2,
+     $                   INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK,
+     $                   LLWRK2, LRWMIN, LWMIN,
+     $                   LHTRD, LWTRD, KD, IB, INDHOUS
+
+
+      DOUBLE PRECISION   ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA,
+     $                   SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHE
+      EXTERNAL           LSAME, ILAENV, DLAMCH, ZLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL, DSTERF, XERBLA, ZLACPY, ZLASCL,
+     $                   ZSTEDC, ZUNMTR, ZHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, SQRT 
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      LOWER = LSAME( UPLO, 'L' )
+      LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            LRWMIN = 1
+            LIWMIN = 1
+         ELSE
+            KD    = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            IF( WANTZ ) THEN
+               LWMIN = 2*N + N*N
+               LRWMIN = 1 + 5*N + 2*N**2
+               LIWMIN = 3 + 5*N
+            ELSE
+               LWMIN = N + 1 + LHTRD + LWTRD
+               LRWMIN = N
+               LIWMIN = 1
+            END IF
+         END IF
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -8
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -10
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -12
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHEEVD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( N.EQ.1 ) THEN
+         W( 1 ) = DBLE( A( 1, 1 ) )
+         IF( WANTZ )
+     $      A( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = SQRT( BIGNUM )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      ISCALE = 0
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 )
+     $   CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO )
+*
+*     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDE    = 1
+      INDRWK  = INDE + N
+      LLRWK   = LRWORK - INDRWK + 1
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+      INDWK2  = INDWRK + N*N
+      LLWRK2  = LWORK - INDWK2 + 1
+*
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWRK ), LLWORK, IINFO )
+*
+*     For eigenvalues only, call DSTERF.  For eigenvectors, first call
+*     ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+*     tridiagonal matrix, then call ZUNMTR to multiply it to the
+*     Householder transformations represented as Householder vectors in
+*     A.
+*
+      IF( .NOT.WANTZ ) THEN
+         CALL DSTERF( N, W, RWORK( INDE ), INFO )
+      ELSE
+         CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N,
+     $                WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK,
+     $                IWORK, LIWORK, INFO )
+         CALL ZUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ),
+     $                WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO )
+         CALL ZLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = N
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of ZHEEVD_2STAGE
+*
+      END
diff --git a/SRC/zheevr_2stage.f b/SRC/zheevr_2stage.f
new file mode 100644
index 00000000..bfd43056
--- /dev/null
+++ b/SRC/zheevr_2stage.f
@@ -0,0 +1,779 @@
+*> \brief <b> ZHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVR_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevr_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+*                                 WORK, LWORK, RWORK, LRWORK, IWORK,
+*                                 LIWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+*      $                   M, N
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            ISUPPZ( * ), IWORK( * )
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         A( LDA, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*>
+*> ZHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call
+*> to ZHETRD.  Then, whenever possible, ZHEEVR_2STAGE calls ZSTEMR to compute
+*> eigenspectrum using Relatively Robust Representations.  ZSTEMR
+*> computes eigenvalues by the dqds algorithm, while orthogonal
+*> eigenvectors are computed from various "good" L D L^T representations
+*> (also known as Relatively Robust Representations). Gram-Schmidt
+*> orthogonalization is avoided as far as possible. More specifically,
+*> the various steps of the algorithm are as follows.
+*>
+*> For each unreduced block (submatrix) of T,
+*>    (a) Compute T - sigma I  = L D L^T, so that L and D
+*>        define all the wanted eigenvalues to high relative accuracy.
+*>        This means that small relative changes in the entries of D and L
+*>        cause only small relative changes in the eigenvalues and
+*>        eigenvectors. The standard (unfactored) representation of the
+*>        tridiagonal matrix T does not have this property in general.
+*>    (b) Compute the eigenvalues to suitable accuracy.
+*>        If the eigenvectors are desired, the algorithm attains full
+*>        accuracy of the computed eigenvalues only right before
+*>        the corresponding vectors have to be computed, see steps c) and d).
+*>    (c) For each cluster of close eigenvalues, select a new
+*>        shift close to the cluster, find a new factorization, and refine
+*>        the shifted eigenvalues to suitable accuracy.
+*>    (d) For each eigenvalue with a large enough relative separation compute
+*>        the corresponding eigenvector by forming a rank revealing twisted
+*>        factorization. Go back to (c) for any clusters that remain.
+*>
+*> The desired accuracy of the output can be specified by the input
+*> parameter ABSTOL.
+*>
+*> For more details, see DSTEMR's documentation and:
+*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+*>   to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+*>   Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+*>   Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+*>   2004.  Also LAPACK Working Note 154.
+*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+*>   tridiagonal eigenvalue/eigenvector problem",
+*>   Computer Science Division Technical Report No. UCB/CSD-97-971,
+*>   UC Berkeley, May 1997.
+*>
+*>
+*> Note 1 : ZHEEVR_2STAGE calls ZSTEMR when the full spectrum is requested
+*> on machines which conform to the ieee-754 floating point standard.
+*> ZHEEVR_2STAGE calls DSTEBZ and ZSTEIN on non-ieee machines and
+*> when partial spectrum requests are made.
+*>
+*> Normal execution of ZSTEMR may create NaNs and infinities and
+*> hence may abort due to a floating point exception in environments
+*> which do not handle NaNs and infinities in the ieee standard default
+*> manner.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*>          For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and
+*>          ZSTEIN are called
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*>
+*>          If high relative accuracy is important, set ABSTOL to
+*>          DLAMCH( 'Safe minimum' ).  Doing so will guarantee that
+*>          eigenvalues are computed to high relative accuracy when
+*>          possible in future releases.  The current code does not
+*>          make any guarantees about high relative accuracy, but
+*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          "Computing Accurate Eigensystems of Scaled Diagonally
+*>          Dominant Matrices", LAPACK Working Note #7, for a discussion
+*>          of which matrices define their eigenvalues to high relative
+*>          accuracy.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          The first M elements contain the selected eigenvalues in
+*>          ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] ISUPPZ
+*> \verbatim
+*>          ISUPPZ is INTEGER array, dimension ( 2*max(1,M) )
+*>          The support of the eigenvectors in Z, i.e., the indices
+*>          indicating the nonzero elements in Z. The i-th eigenvector
+*>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
+*>          ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal
+*>          matrix). The support of the eigenvectors of A is typically 
+*>          1:N because of the unitary transformations applied by ZUNMTR.
+*>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 26*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal sizes of the WORK, RWORK and
+*>          IWORK arrays, returns these values as the first entries of
+*>          the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+*>          On exit, if INFO = 0, RWORK(1) returns the optimal
+*>          (and minimal) LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal
+*>          (and minimal) LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the
+*>          routine only calculates the optimal sizes of the WORK, RWORK
+*>          and IWORK arrays, returns these values as the first entries
+*>          of the WORK, RWORK and IWORK arrays, and no error message
+*>          related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  Internal error
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Contributors:
+*  ==================
+*>
+*>     Inderjit Dhillon, IBM Almaden, USA \n
+*>     Osni Marques, LBNL/NERSC, USA \n
+*>     Ken Stanley, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>     Jason Riedy, Computer Science Division, University of
+*>       California at Berkeley, USA \n
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ,
+     $                          WORK, LWORK, RWORK, LRWORK, IWORK,
+     $                          LIWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK,
+     $                   M, N
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            ISUPPZ( * ), IWORK( * )
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ, TRYRAC
+      CHARACTER          ORDER
+      INTEGER            I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP,
+     $                   INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK,
+     $                   INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ,
+     $                   LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN,
+     $                   LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANSY
+      EXTERNAL           LSAME, ILAENV, DLAMCH, ZLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL,
+     $                   ZHETRD_2STAGE, ZSTEMR, ZSTEIN, ZSWAP, ZUNMTR
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IEEEOK = ILAENV( 10, 'ZHEEVR', 'N', 1, 2, 3, 4 )
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+*
+      LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
+     $         ( LIWORK.EQ.-1 ) )
+*
+      KD     = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
+      LHTRD  = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWTRD  = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
+      LWMIN  = N + LHTRD + LWTRD
+      LRWMIN = MAX( 1, 24*N )
+      LIWMIN = MAX( 1, 10*N )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         WORK( 1 )  = LWMIN
+         RWORK( 1 ) = LRWMIN
+         IWORK( 1 ) = LIWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -18
+         ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -20
+         ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -22
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHEEVR_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         WORK( 1 ) = 2
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+            W( 1 ) = DBLE( A( 1, 1 ) )
+         ELSE
+            IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) )
+     $           THEN
+               M = 1
+               W( 1 ) = DBLE( A( 1, 1 ) )
+            END IF
+         END IF
+         IF( WANTZ ) THEN
+            Z( 1, 1 ) = ONE
+            ISUPPZ( 1 ) = 1
+            ISUPPZ( 2 ) = 1
+         END IF
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF (VALEIG) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = ZLANSY( 'M', UPLO, N, A, LDA, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+
+*     Initialize indices into workspaces.  Note: The IWORK indices are
+*     used only if DSTERF or ZSTEMR fail.
+
+*     WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the
+*     elementary reflectors used in ZHETRD.
+      INDTAU = 1
+*     INDWK is the starting offset of the remaining complex workspace,
+*     and LLWORK is the remaining complex workspace size.
+      INDHOUS = INDTAU + N
+      INDWK   = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWK + 1
+
+*     RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal
+*     entries.
+      INDRD = 1
+*     RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the
+*     tridiagonal matrix from ZHETRD.
+      INDRE = INDRD + N
+*     RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over
+*     -written by ZSTEMR (the DSTERF path copies the diagonal to W).
+      INDRDD = INDRE + N
+*     RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over
+*     -written while computing the eigenvalues in DSTERF and ZSTEMR.
+      INDREE = INDRDD + N
+*     INDRWK is the starting offset of the left-over real workspace, and
+*     LLRWORK is the remaining workspace size.
+      INDRWK = INDREE + N
+      LLRWORK = LRWORK - INDRWK + 1
+
+*     IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and
+*     stores the block indices of each of the M<=N eigenvalues.
+      INDIBL = 1
+*     IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and
+*     stores the starting and finishing indices of each block.
+      INDISP = INDIBL + N
+*     IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors
+*     that corresponding to eigenvectors that fail to converge in
+*     ZSTEIN.  This information is discarded; if any fail, the driver
+*     returns INFO > 0.
+      INDIFL = INDISP + N
+*     INDIWO is the offset of the remaining integer workspace.
+      INDIWO = INDIFL + N
+
+*
+*     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+     $                    RWORK( INDRE ), WORK( INDTAU ),
+     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDWK ), LLWORK, IINFO )
+*
+*     If all eigenvalues are desired
+*     then call DSTERF or ZSTEMR and ZUNMTR.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N, RWORK( INDRD ), 1, W, 1 )
+            CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+            CALL DSTERF( N, W, RWORK( INDREE ), INFO )
+         ELSE
+            CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
+            CALL DCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
+*
+            IF (ABSTOL .LE. TWO*N*EPS) THEN
+               TRYRAC = .TRUE.
+            ELSE
+               TRYRAC = .FALSE.
+            END IF
+            CALL ZSTEMR( JOBZ, 'A', N, RWORK( INDRDD ),
+     $                   RWORK( INDREE ), VL, VU, IL, IU, M, W,
+     $                   Z, LDZ, N, ISUPPZ, TRYRAC,
+     $                   RWORK( INDRWK ), LLRWORK,
+     $                   IWORK, LIWORK, INFO )
+*
+*           Apply unitary matrix used in reduction to tridiagonal
+*           form to eigenvectors returned by ZSTEMR.
+*
+            IF( WANTZ .AND. INFO.EQ.0 ) THEN
+               INDWKN = INDWK
+               LLWRKN = LWORK - INDWKN + 1
+               CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA,
+     $                      WORK( INDTAU ), Z, LDZ, WORK( INDWKN ),
+     $                      LLWRKN, IINFO )
+            END IF
+         END IF
+*
+*
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 30
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+*     Also call DSTEBZ and ZSTEIN if ZSTEMR fails.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWO ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL ZSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ),
+     $                INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by ZSTEIN.
+*
+         INDWKN = INDWK
+         LLWRKN = LWORK - INDWKN + 1
+         CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWKN ), LLWRKN, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   30 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 50 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 40 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   40       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+            END IF
+   50    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal workspace size.
+*
+      WORK( 1 )  = LWMIN
+      RWORK( 1 ) = LRWMIN
+      IWORK( 1 ) = LIWMIN
+*
+      RETURN
+*
+*     End of ZHEEVR_2STAGE
+*
+      END
diff --git a/SRC/zheevx_2stage.f b/SRC/zheevx_2stage.f
new file mode 100644
index 00000000..e33d55e0
--- /dev/null
+++ b/SRC/zheevx_2stage.f
@@ -0,0 +1,618 @@
+*> \brief <b> ZHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b>
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEEVX_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevx_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+*                                 IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+*                                 LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, RANGE, UPLO
+*       INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+*       DOUBLE PRECISION   ABSTOL, VL, VU
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IFAIL( * ), IWORK( * )
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         A( LDA, * ), WORK( * ), Z( LDZ, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors
+*> of a complex Hermitian matrix A using the 2stage technique for
+*> the reduction to tridiagonal.  Eigenvalues and eigenvectors can
+*> be selected by specifying either a range of values or a range of
+*> indices for the desired eigenvalues.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] RANGE
+*> \verbatim
+*>          RANGE is CHARACTER*1
+*>          = 'A': all eigenvalues will be found.
+*>          = 'V': all eigenvalues in the half-open interval (VL,VU]
+*>                 will be found.
+*>          = 'I': the IL-th through IU-th eigenvalues will be found.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>          On exit, the lower triangle (if UPLO='L') or the upper
+*>          triangle (if UPLO='U') of A, including the diagonal, is
+*>          destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in] VL
+*> \verbatim
+*>          VL is DOUBLE PRECISION
+*>          If RANGE='V', the lower bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] VU
+*> \verbatim
+*>          VU is DOUBLE PRECISION
+*>          If RANGE='V', the upper bound of the interval to
+*>          be searched for eigenvalues. VL < VU.
+*>          Not referenced if RANGE = 'A' or 'I'.
+*> \endverbatim
+*>
+*> \param[in] IL
+*> \verbatim
+*>          IL is INTEGER
+*>          If RANGE='I', the index of the
+*>          smallest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] IU
+*> \verbatim
+*>          IU is INTEGER
+*>          If RANGE='I', the index of the
+*>          largest eigenvalue to be returned.
+*>          1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
+*>          Not referenced if RANGE = 'A' or 'V'.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>          The absolute error tolerance for the eigenvalues.
+*>          An approximate eigenvalue is accepted as converged
+*>          when it is determined to lie in an interval [a,b]
+*>          of width less than or equal to
+*>
+*>                  ABSTOL + EPS *   max( |a|,|b| ) ,
+*>
+*>          where EPS is the machine precision.  If ABSTOL is less than
+*>          or equal to zero, then  EPS*|T|  will be used in its place,
+*>          where |T| is the 1-norm of the tridiagonal matrix obtained
+*>          by reducing A to tridiagonal form.
+*>
+*>          Eigenvalues will be computed most accurately when ABSTOL is
+*>          set to twice the underflow threshold 2*DLAMCH('S'), not zero.
+*>          If this routine returns with INFO>0, indicating that some
+*>          eigenvectors did not converge, try setting ABSTOL to
+*>          2*DLAMCH('S').
+*>
+*>          See "Computing Small Singular Values of Bidiagonal Matrices
+*>          with Guaranteed High Relative Accuracy," by Demmel and
+*>          Kahan, LAPACK Working Note #3.
+*> \endverbatim
+*>
+*> \param[out] M
+*> \verbatim
+*>          M is INTEGER
+*>          The total number of eigenvalues found.  0 <= M <= N.
+*>          If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          On normal exit, the first M elements contain the selected
+*>          eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] Z
+*> \verbatim
+*>          Z is COMPLEX*16 array, dimension (LDZ, max(1,M))
+*>          If JOBZ = 'V', then if INFO = 0, the first M columns of Z
+*>          contain the orthonormal eigenvectors of the matrix A
+*>          corresponding to the selected eigenvalues, with the i-th
+*>          column of Z holding the eigenvector associated with W(i).
+*>          If an eigenvector fails to converge, then that column of Z
+*>          contains the latest approximation to the eigenvector, and the
+*>          index of the eigenvector is returned in IFAIL.
+*>          If JOBZ = 'N', then Z is not referenced.
+*>          Note: the user must ensure that at least max(1,M) columns are
+*>          supplied in the array Z; if RANGE = 'V', the exact value of M
+*>          is not known in advance and an upper bound must be used.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*>          LDZ is INTEGER
+*>          The leading dimension of the array Z.  LDZ >= 1, and if
+*>          JOBZ = 'V', LDZ >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, 8*N, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (7*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (5*N)
+*> \endverbatim
+*>
+*> \param[out] IFAIL
+*> \verbatim
+*>          IFAIL is INTEGER array, dimension (N)
+*>          If JOBZ = 'V', then if INFO = 0, the first M elements of
+*>          IFAIL are zero.  If INFO > 0, then IFAIL contains the
+*>          indices of the eigenvectors that failed to converge.
+*>          If JOBZ = 'N', then IFAIL is not referenced.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, then i eigenvectors failed to converge.
+*>                Their indices are stored in array IFAIL.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU,
+     $                          IL, IU, ABSTOL, M, W, Z, LDZ, WORK,
+     $                          LWORK, RWORK, IWORK, IFAIL, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, RANGE, UPLO
+      INTEGER            IL, INFO, IU, LDA, LDZ, LWORK, M, N
+      DOUBLE PRECISION   ABSTOL, VL, VU
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IFAIL( * ), IWORK( * )
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), Z( LDZ, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CONE
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG,
+     $                   WANTZ
+      CHARACTER          ORDER
+      INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
+     $                   INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
+     $                   ITMP1, J, JJ, LLWORK, 
+     $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
+      DOUBLE PRECISION   ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
+     $                   SIGMA, SMLNUM, TMP1, VLL, VUU
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLAMCH, ZLANHE
+      EXTERNAL           LSAME, ILAENV, DLAMCH, ZLANHE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL,
+     $                   ZLACPY, ZSTEIN, ZSTEQR, ZSWAP, ZUNGTR, ZUNMTR,
+     $                   ZHETRD_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LOWER = LSAME( UPLO, 'L' )
+      WANTZ = LSAME( JOBZ, 'V' )
+      ALLEIG = LSAME( RANGE, 'A' )
+      VALEIG = LSAME( RANGE, 'V' )
+      INDEIG = LSAME( RANGE, 'I' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE
+         IF( VALEIG ) THEN
+            IF( N.GT.0 .AND. VU.LE.VL )
+     $         INFO = -8
+         ELSE IF( INDEIG ) THEN
+            IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN
+               INFO = -9
+            ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN
+               INFO = -10
+            END IF
+         END IF
+      END IF
+      IF( INFO.EQ.0 ) THEN
+         IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         IF( N.LE.1 ) THEN
+            LWMIN = 1
+            WORK( 1 ) = LWMIN
+         ELSE
+            KD    = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+            IB    = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+            LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+            LWMIN = N + LHTRD + LWTRD
+            WORK( 1 )  = LWMIN
+         END IF
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
+     $      INFO = -17
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHEEVX_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      M = 0
+      IF( N.EQ.0 ) THEN
+         RETURN
+      END IF
+*
+      IF( N.EQ.1 ) THEN
+         IF( ALLEIG .OR. INDEIG ) THEN
+            M = 1
+         W( 1 ) = DBLE( A( 1, 1 ) )
+         ELSE IF( VALEIG ) THEN
+            IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) )
+     $           THEN
+               M = 1
+               W( 1 ) = DBLE( A( 1, 1 ) )
+            END IF
+         END IF
+         IF( WANTZ )
+     $      Z( 1, 1 ) = CONE
+         RETURN
+      END IF
+*
+*     Get machine constants.
+*
+      SAFMIN = DLAMCH( 'Safe minimum' )
+      EPS    = DLAMCH( 'Precision' )
+      SMLNUM = SAFMIN / EPS
+      BIGNUM = ONE / SMLNUM
+      RMIN   = SQRT( SMLNUM )
+      RMAX   = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
+*
+*     Scale matrix to allowable range, if necessary.
+*
+      ISCALE = 0
+      ABSTLL = ABSTOL
+      IF( VALEIG ) THEN
+         VLL = VL
+         VUU = VU
+      END IF
+      ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK )
+      IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN
+         ISCALE = 1
+         SIGMA = RMIN / ANRM
+      ELSE IF( ANRM.GT.RMAX ) THEN
+         ISCALE = 1
+         SIGMA = RMAX / ANRM
+      END IF
+      IF( ISCALE.EQ.1 ) THEN
+         IF( LOWER ) THEN
+            DO 10 J = 1, N
+               CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 )
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, N
+               CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 )
+   20       CONTINUE
+         END IF
+         IF( ABSTOL.GT.0 )
+     $      ABSTLL = ABSTOL*SIGMA
+         IF( VALEIG ) THEN
+            VLL = VL*SIGMA
+            VUU = VU*SIGMA
+         END IF
+      END IF
+*
+*     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
+*
+      INDD    = 1
+      INDE    = INDD + N
+      INDRWK  = INDE + N
+      INDTAU  = 1
+      INDHOUS = INDTAU + N
+      INDWRK  = INDHOUS + LHTRD
+      LLWORK  = LWORK - INDWRK + 1
+*
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
+     $                    RWORK( INDE ), WORK( INDTAU ), 
+     $                    WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
+     $                    LLWORK, IINFO )
+*
+*     If all eigenvalues are desired and ABSTOL is less than or equal to
+*     zero, then call DSTERF or ZUNGTR and ZSTEQR.  If this fails for
+*     some eigenvalue, then try DSTEBZ.
+*
+      TEST = .FALSE.
+      IF( INDEIG ) THEN
+         IF( IL.EQ.1 .AND. IU.EQ.N ) THEN
+            TEST = .TRUE.
+         END IF
+      END IF
+      IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN
+         CALL DCOPY( N, RWORK( INDD ), 1, W, 1 )
+         INDEE = INDRWK + 2*N
+         IF( .NOT.WANTZ ) THEN
+            CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL DSTERF( N, W, RWORK( INDEE ), INFO )
+         ELSE
+            CALL ZLACPY( 'A', N, N, A, LDA, Z, LDZ )
+            CALL ZUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ),
+     $                   WORK( INDWRK ), LLWORK, IINFO )
+            CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 )
+            CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ,
+     $                   RWORK( INDRWK ), INFO )
+            IF( INFO.EQ.0 ) THEN
+               DO 30 I = 1, N
+                  IFAIL( I ) = 0
+   30          CONTINUE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            M = N
+            GO TO 40
+         END IF
+         INFO = 0
+      END IF
+*
+*     Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN.
+*
+      IF( WANTZ ) THEN
+         ORDER = 'B'
+      ELSE
+         ORDER = 'E'
+      END IF
+      INDIBL = 1
+      INDISP = INDIBL + N
+      INDIWK = INDISP + N
+      CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL,
+     $             RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W,
+     $             IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ),
+     $             IWORK( INDIWK ), INFO )
+*
+      IF( WANTZ ) THEN
+         CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W,
+     $                IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ,
+     $                RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO )
+*
+*        Apply unitary matrix used in reduction to tridiagonal
+*        form to eigenvectors returned by ZSTEIN.
+*
+         CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z,
+     $                LDZ, WORK( INDWRK ), LLWORK, IINFO )
+      END IF
+*
+*     If matrix was scaled, then rescale eigenvalues appropriately.
+*
+   40 CONTINUE
+      IF( ISCALE.EQ.1 ) THEN
+         IF( INFO.EQ.0 ) THEN
+            IMAX = M
+         ELSE
+            IMAX = INFO - 1
+         END IF
+         CALL DSCAL( IMAX, ONE / SIGMA, W, 1 )
+      END IF
+*
+*     If eigenvalues are not in order, then sort them, along with
+*     eigenvectors.
+*
+      IF( WANTZ ) THEN
+         DO 60 J = 1, M - 1
+            I = 0
+            TMP1 = W( J )
+            DO 50 JJ = J + 1, M
+               IF( W( JJ ).LT.TMP1 ) THEN
+                  I = JJ
+                  TMP1 = W( JJ )
+               END IF
+   50       CONTINUE
+*
+            IF( I.NE.0 ) THEN
+               ITMP1 = IWORK( INDIBL+I-1 )
+               W( I ) = W( J )
+               IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 )
+               W( J ) = TMP1
+               IWORK( INDIBL+J-1 ) = ITMP1
+               CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 )
+               IF( INFO.NE.0 ) THEN
+                  ITMP1 = IFAIL( I )
+                  IFAIL( I ) = IFAIL( J )
+                  IFAIL( J ) = ITMP1
+               END IF
+            END IF
+   60    CONTINUE
+      END IF
+*
+*     Set WORK(1) to optimal complex workspace size.
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of ZHEEVX_2STAGE
+*
+      END
diff --git a/SRC/zhegv_2stage.f b/SRC/zhegv_2stage.f
new file mode 100644
index 00000000..5079d240
--- /dev/null
+++ b/SRC/zhegv_2stage.f
@@ -0,0 +1,379 @@
+*> \brief \b ZHEGV_2STAGE
+*
+*  @precisions fortran z -> c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHEGV_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhegv_2stage.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+*                                WORK, LWORK, RWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          JOBZ, UPLO
+*       INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   RWORK( * ), W( * )
+*       COMPLEX*16         A( LDA, * ), B( LDB, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
+*> of a complex generalized Hermitian-definite eigenproblem, of the form
+*> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+*> Here A and B are assumed to be Hermitian and B is also
+*> positive definite.
+*> This routine use the 2stage technique for the reduction to tridiagonal
+*> which showed higher performance on recent architecture and for large
+*  sizes N>2000.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] ITYPE
+*> \verbatim
+*>          ITYPE is INTEGER
+*>          Specifies the problem type to be solved:
+*>          = 1:  A*x = (lambda)*B*x
+*>          = 2:  A*B*x = (lambda)*x
+*>          = 3:  B*A*x = (lambda)*x
+*> \endverbatim
+*>
+*> \param[in] JOBZ
+*> \verbatim
+*>          JOBZ is CHARACTER*1
+*>          = 'N':  Compute eigenvalues only;
+*>          = 'V':  Compute eigenvalues and eigenvectors.
+*>                  Not available in this release.
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangles of A and B are stored;
+*>          = 'L':  Lower triangles of A and B are stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrices A and B.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA, N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the
+*>          leading N-by-N upper triangular part of A contains the
+*>          upper triangular part of the matrix A.  If UPLO = 'L',
+*>          the leading N-by-N lower triangular part of A contains
+*>          the lower triangular part of the matrix A.
+*>
+*>          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+*>          matrix Z of eigenvectors.  The eigenvectors are normalized
+*>          as follows:
+*>          if ITYPE = 1 or 2, Z**H*B*Z = I;
+*>          if ITYPE = 3, Z**H*inv(B)*Z = I.
+*>          If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
+*>          or the lower triangle (if UPLO='L') of A, including the
+*>          diagonal, is destroyed.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB, N)
+*>          On entry, the Hermitian positive definite matrix B.
+*>          If UPLO = 'U', the leading N-by-N upper triangular part of B
+*>          contains the upper triangular part of the matrix B.
+*>          If UPLO = 'L', the leading N-by-N lower triangular part of B
+*>          contains the lower triangular part of the matrix B.
+*>
+*>          On exit, if INFO <= N, the part of B containing the matrix is
+*>          overwritten by the triangular factor U or L from the Cholesky
+*>          factorization B = U**H*U or B = L*L**H.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] W
+*> \verbatim
+*>          W is DOUBLE PRECISION array, dimension (N)
+*>          If INFO = 0, the eigenvalues in ascending order.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The length of the array WORK. LWORK >= 1, when N <= 1;
+*>          otherwise  
+*>          If JOBZ = 'N' and N > 1, LWORK must be queried.
+*>                                   LWORK = MAX(1, dimension) where
+*>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                               + (KD+1)*N + N
+*>                                   where KD is the blocking size of the reduction,
+*>                                   FACTOPTNB is the blocking used by the QR or LQ
+*>                                   algorithm, usually FACTOPTNB=128 is a good choice
+*>                                   NTHREADS is the number of threads used when
+*>                                   openMP compilation is enabled, otherwise =1.
+*>          If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2))
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  ZPOTRF or ZHEEV returned an error code:
+*>             <= N:  if INFO = i, ZHEEV failed to converge;
+*>                    i off-diagonal elements of an intermediate
+*>                    tridiagonal form did not converge to zero;
+*>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
+*>                    minor of order i of B is not positive definite.
+*>                    The factorization of B could not be completed and
+*>                    no eigenvalues or eigenvectors were computed.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEeigen
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  All details about the 2stage techniques are available in:
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W,
+     $                         WORK, LWORK, RWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK driver routine (version 3.6.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          JOBZ, UPLO
+      INTEGER            INFO, ITYPE, LDA, LDB, LWORK, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   RWORK( * ), W( * )
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTZ
+      CHARACTER          TRANS
+      INTEGER            NEIG, LWMIN, LHTRD, LWTRD, KD, IB
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZHEGST, ZPOTRF, ZTRMM, ZTRSM,
+     $                   ZHEEV_2STAGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      WANTZ = LSAME( JOBZ, 'V' )
+      UPPER = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+*
+      INFO = 0
+      IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN
+         INFO = -1
+      ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         KD    = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 )
+         IB    = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
+         LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
+         LWMIN = N + LHTRD + LWTRD
+         WORK( 1 )  = LWMIN
+*
+         IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHEGV_2STAGE ', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     Form a Cholesky factorization of B.
+*
+      CALL ZPOTRF( UPLO, N, B, LDB, INFO )
+      IF( INFO.NE.0 ) THEN
+         INFO = N + INFO
+         RETURN
+      END IF
+*
+*     Transform problem to standard eigenvalue problem and solve.
+*
+      CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
+      CALL ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, 
+     $                   WORK, LWORK, RWORK, INFO )
+*
+      IF( WANTZ ) THEN
+*
+*        Backtransform eigenvectors to the original problem.
+*
+         NEIG = N
+         IF( INFO.GT.0 )
+     $      NEIG = INFO - 1
+         IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN
+*
+*           For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
+*           backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y
+*
+            IF( UPPER ) THEN
+               TRANS = 'N'
+            ELSE
+               TRANS = 'C'
+            END IF
+*
+            CALL ZTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+*
+         ELSE IF( ITYPE.EQ.3 ) THEN
+*
+*           For B*A*x=(lambda)*x;
+*           backtransform eigenvectors: x = L*y or U**H *y
+*
+            IF( UPPER ) THEN
+               TRANS = 'C'
+            ELSE
+               TRANS = 'N'
+            END IF
+*
+            CALL ZTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE,
+     $                  B, LDB, A, LDA )
+         END IF
+      END IF
+*
+      WORK( 1 ) = LWMIN
+*
+      RETURN
+*
+*     End of ZHEGV_2STAGE
+*
+      END
diff --git a/SRC/zhetrd_2stage.f b/SRC/zhetrd_2stage.f
new file mode 100644
index 00000000..62fd7539
--- /dev/null
+++ b/SRC/zhetrd_2stage.f
@@ -0,0 +1,337 @@
+*> \brief \b ZHETRD_2STAGE
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZHETRD_2STAGE + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       CHARACTER          VECT, UPLO
+*       INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*      ..
+*      .. Array Arguments ..
+*       DOUBLE PRECISION   D( * ), E( * )
+*       COMPLEX*16         A( LDA, * ), TAU( * ),
+*                          HOUS2( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q1**H Q2**H* A * Q2 * Q1 = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  in particular for the second stage (Band to
+*>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>                  then LHOUS2 is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the band superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          internal band-diagonal matrix AB, and the elements above 
+*>          the KD superdiagonal, with the array TAU, represent the unitary
+*>          matrix Q1 as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and band subdiagonal of A are over-
+*>          written by the corresponding elements of the internal band-diagonal
+*>          matrix AB, and the elements below the KD subdiagonal, with
+*>          the array TAU, represent the unitary matrix Q1 as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is DOUBLE PRECISION array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors of 
+*>          the first stage (see Further Details).
+*> \endverbatim
+*>
+*> \param[out] HOUS2
+*> \verbatim
+*>          HOUS2 is COMPLEX*16 array, dimension LHOUS2, that
+*>          store the Householder representation of the stage2
+*>          band to tridiagonal.
+*> \endverbatim
+*>
+*> \param[in] LHOUS2
+*> \verbatim
+*>          LHOUS2 is INTEGER
+*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS2 array, returns
+*>          this value as the first entry of the HOUS2 array, and no error
+*>          message related to LHOUS2 is issued by XERBLA.
+*>          LHOUS2 = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS2=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = max(stage1,stage2) + (KD+1)*N
+*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
+*>                        + max(2*KD*KD, KD*NTHREADS) 
+*>                        + (KD+1)*N 
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+     $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          VECT, UPLO
+      INTEGER            N, LDA, LWORK, LHOUS2, INFO
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   D( * ), E( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ),
+     $                   HOUS2( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER, WANTQ
+      INTEGER            KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZHETRD_HE2HB, ZHETRD_HB2ST
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO   = 0
+      WANTQ  = LSAME( VECT, 'V' )
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      KD     = ILAENV( 17, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
+      IB     = ILAENV( 18, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+*      WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
+*     $            LHMIN, LWMIN
+*
+      IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -12
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS2( 1 ) = LHMIN
+         WORK( 1 )  = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHETRD_2STAGE', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDAB  = KD+1
+      LWRK  = LWORK-LDAB*N
+      ABPOS = 1
+      WPOS  = ABPOS + LDAB*N
+      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+     $                   TAU, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
+         RETURN
+      END IF
+      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
+     $                   WORK( ABPOS ), LDAB, D, E, 
+     $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
+         RETURN
+      END IF
+*
+*
+      HOUS2( 1 ) = LHMIN
+      WORK( 1 )  = LWMIN
+      RETURN
+*
+*     End of ZHETRD_2STAGE
+*
+      END
diff --git a/SRC/zhetrd_hb2st.F b/SRC/zhetrd_hb2st.F
new file mode 100644
index 00000000..5d62e30d
--- /dev/null
+++ b/SRC/zhetrd_hb2st.F
@@ -0,0 +1,603 @@
+*> \brief \b ZHBTRD
+*
+*  @precisions fortran z -> s d c
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZHBTRD + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbtrd.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+*       #define PRECISION_COMPLEX
+*
+*       #if defined(_OPENMP)
+*       use omp_lib
+*       #endif
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          STAGE1, UPLO, VECT
+*       INTEGER            N, KD, IB, LDAB, LHOUS, LWORK, INFO
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   D( * ), E( * )
+*       COMPLEX*16         AB( LDAB, * ), HOUS( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHBTRD reduces a complex Hermitian band matrix A to real symmetric
+*> tridiagonal form T by a unitary similarity transformation:
+*> Q**H * A * Q = T.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] STAGE
+*> \verbatim
+*>          STAGE is CHARACTER*1
+*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>                  from dense to band using the zhetrd_he2hb routine
+*>                  was not called before this routine to reproduce AB. 
+*>                  In other term this routine is called as standalone. 
+*>          = 'Y':  "Yes": to mention that the stage 1 of the 
+*>                  reduction from dense to band using the zhetrd_he2hb 
+*>                  routine has been called to produce AB (e.g., AB is
+*>                  the output of zhetrd_he2hb.
+*> \endverbatim
+*>
+*> \param[in] VECT
+*> \verbatim
+*>          VECT is CHARACTER*1
+*>          = 'N':  No need for the Housholder representation, 
+*>                  and thus LHOUS is of size max(1, 4*N);
+*>          = 'V':  the Householder representation is needed to 
+*>                  either generate or to apply Q later on, 
+*>                  then LHOUS is to be queried and computed.
+*>                  (NOT AVAILABLE IN THIS RELEASE).
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the matrix A if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB,N)
+*>          On entry, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*>          On exit, the diagonal elements of AB are overwritten by the
+*>          diagonal elements of the tridiagonal matrix T; if KD > 0, the
+*>          elements on the first superdiagonal (if UPLO = 'U') or the
+*>          first subdiagonal (if UPLO = 'L') are overwritten by the
+*>          off-diagonal elements of T; the rest of AB is overwritten by
+*>          values generated during the reduction.
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension (N)
+*>          The diagonal elements of the tridiagonal matrix T.
+*> \endverbatim
+*>
+*> \param[out] E
+*> \verbatim
+*>          E is DOUBLE PRECISION array, dimension (N-1)
+*>          The off-diagonal elements of the tridiagonal matrix T:
+*>          E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'.
+*> \endverbatim
+*>
+*> \param[out] HOUS
+*> \verbatim
+*>          HOUS is COMPLEX*16 array, dimension LHOUS, that
+*>          store the Householder representation.
+*> \endverbatim
+*>
+*> \param[in] LHOUS
+*> \verbatim
+*>          LHOUS is INTEGER
+*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a query is assumed; the routine
+*>          only calculates the optimal size of the HOUS array, returns
+*>          this value as the first entry of the HOUS array, and no error
+*>          message related to LHOUS is issued by XERBLA.
+*>          LHOUS = MAX(1, dimension) where
+*>          dimension = 4*N if VECT='N'
+*>          not available now if VECT='H'     
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          If LWORK = -1, or LHOUS=-1,
+*>          then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK = MAX(1, dimension) where
+*>          dimension   = (2KD+1)*N + KD*NTHREADS
+*>          where KD is the blocking size of the reduction,
+*>          FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice
+*>          NTHREADS is the number of threads used when
+*>          openMP compilation is enabled, otherwise =1.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2016
+*
+*> \ingroup complex16OTHERcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+     $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
+*
+#define PRECISION_COMPLEX
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          STAGE1, UPLO, VECT
+      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   D( * ), E( * )
+      COMPLEX*16         AB( LDAB, * ), HOUS( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   RZERO
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( RZERO = 0.0D+0,
+     $                   ZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   ONE  = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
+     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
+     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
+     $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
+#if defined (PRECISION_COMPLEX)
+      DOUBLE PRECISION   ABSTMP
+      COMPLEX*16         TMP
+#endif
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZHB2ST_KERNELS, ZLACPY, ZLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX, CEILING, DBLE, REAL
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required.
+*     Test the input parameters
+*
+      DEBUG   = 0
+      INFO    = 0
+      AFTERS1 = LSAME( STAGE1, 'Y' )
+      WANTQ   = LSAME( VECT, 'V' )
+      UPPER   = LSAME( UPLO, 'U' )
+      LQUERY  = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 )
+*
+*     Determine the block size, the workspace size and the hous size.
+*
+      IB     = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      LHMIN  = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      LWMIN  = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+*
+      IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
+         INFO = -1
+      ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN
+         INFO = -2
+      ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -3
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.(KD+1) ) THEN
+         INFO = -7
+      ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -11
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -13
+      END IF
+*
+      IF( INFO.EQ.0 ) THEN
+         HOUS( 1 ) = LHMIN
+         WORK( 1 ) = LWMIN
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 ) THEN
+         WORK( 1 ) = 1
+         RETURN
+      END IF
+*
+*     Determine pointer position
+*
+      LDV      = KD + IB
+      SIZETAU  = 2 * N
+      SIZEV    = 2 * N
+      INDTAU   = 1
+      INDV     = INDTAU + SIZETAU
+      LDA      = 2 * KD + 1
+      SIZEA    = LDA * N
+      INDA     = 1
+      INDW     = INDA + SIZEA
+      NTHREADS = 1
+      TID      = 0
+*
+      IF( UPPER ) THEN
+          APOS     = INDA + KD
+          AWPOS    = INDA
+          DPOS     = APOS + KD
+          OFDPOS   = DPOS - 1
+          ABDPOS   = KD + 1
+          ABOFDPOS = KD
+      ELSE
+          APOS     = INDA 
+          AWPOS    = INDA + KD + 1
+          DPOS     = APOS
+          OFDPOS   = DPOS + 1
+          ABDPOS   = 1
+          ABOFDPOS = 2
+
+      ENDIF
+*      
+*     Case KD=0: 
+*     The matrix is diagonal. We just copy it (convert to "real" for 
+*     complex because D is double and the imaginary part should be 0) 
+*     and store it in D. A sequential code here is better or 
+*     in a parallel environment it might need two cores for D and E
+*
+      IF( KD.EQ.0 ) THEN
+          DO 30 I = 1, N
+              D( I ) = DBLE( AB( ABDPOS, I ) )
+   30     CONTINUE
+          DO 40 I = 1, N-1
+              E( I ) = RZERO
+   40     CONTINUE
+         GOTO 200
+      END IF
+*      
+*     Case KD=1: 
+*     The matrix is already Tridiagonal. We have to make diagonal 
+*     and offdiagonal elements real, and store them in D and E.
+*     For that, for real precision just copy the diag and offdiag 
+*     to D and E while for the COMPLEX case the bulge chasing is  
+*     performed to convert the hermetian tridiagonal to symmetric 
+*     tridiagonal. A simpler coversion formula might be used, but then 
+*     updating the Q matrix will be required and based if Q is generated
+*     or not this might complicate the story. 
+*      
+C      IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN
+      IF( KD.EQ.1 ) THEN
+          DO 50 I = 1, N
+              D( I ) = DBLE( AB( ABDPOS, I ) )
+   50     CONTINUE
+#if defined (PRECISION_COMPLEX)
+*
+*         make off-diagonal elements real and copy them to E
+*
+          IF( UPPER ) THEN
+              DO 60 I = 1, N - 1
+                  TMP = AB( ABOFDPOS, I+1 )
+                  ABSTMP = ABS( TMP )
+                  AB( ABOFDPOS, I+1 ) = ABSTMP
+                  E( I ) = ABSTMP
+                  IF( ABSTMP.NE.RZERO ) THEN
+                     TMP = TMP / ABSTMP
+                  ELSE
+                     TMP = ONE
+                  END IF
+                  IF( I.LT.N-1 )
+     $               AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP
+C                  IF( WANTZ ) THEN
+C                     CALL ZSCAL( N, DCONJG( TMP ), Q( 1, I+1 ), 1 )
+C                  END IF
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N - 1
+                 TMP = AB( ABOFDPOS, I )
+                 ABSTMP = ABS( TMP )
+                 AB( ABOFDPOS, I ) = ABSTMP
+                 E( I ) = ABSTMP
+                 IF( ABSTMP.NE.RZERO ) THEN
+                    TMP = TMP / ABSTMP
+                 ELSE
+                    TMP = ONE
+                 END IF
+                 IF( I.LT.N-1 )
+     $              AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP
+C                 IF( WANTQ ) THEN
+C                    CALL ZSCAL( N, TMP, Q( 1, I+1 ), 1 )
+C                 END IF
+   70         CONTINUE
+          ENDIF
+#else
+          IF( UPPER ) THEN
+              DO 60 I = 1, N-1
+                 E( I ) = DBLE( AB( ABOFDPOS, I+1 ) )
+   60         CONTINUE
+          ELSE
+              DO 70 I = 1, N-1
+                 E( I ) = DBLE( AB( ABOFDPOS, I ) )
+   70         CONTINUE
+          ENDIF
+#endif
+          GOTO 200
+      END IF
+*
+*     Main code start here. 
+*     Reduce the hermitian band of A to a tridiagonal matrix.
+*
+      THGRSIZ   = N
+      GRSIZ     = 1
+      SHIFT     = 3
+      NBTILES   = CEILING( REAL(N)/REAL(KD) )
+      STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
+      THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
+*      
+      CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
+      CALL ZLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
+*
+*
+*     openMP parallelisation start here
+*
+#if defined(_OPENMP)
+!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
+!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
+!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
+!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+!$OMP MASTER
+#endif
+*
+*     main bulge chasing loop
+*      
+      DO 100 THGRID = 1, THGRNB
+          STT  = (THGRID-1)*THGRSIZ+1
+          THED = MIN( (STT + THGRSIZ -1), (N-1))
+          DO 110 I = STT, N-1
+              ED = MIN( I, THED )
+              IF( STT.GT.ED ) GOTO 100
+              DO 120 M = 1, STEPERCOL
+                  ST = STT
+                  DO 130 SWEEPID = ST, ED
+                      DO 140 K = 1, GRSIZ
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+     $                           + (M-1)*GRSIZ + K
+                          IF ( MYID.EQ.1 ) THEN
+                              TTYPE = 1
+                          ELSE
+                              TTYPE = MOD( MYID, 2 ) + 2
+                          ENDIF
+
+                          IF( TTYPE.EQ.2 ) THEN
+                              COLPT      = (MYID/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              BLKLASTIND = COLPT
+                          ELSE
+                              COLPT      = ((MYID+1)/2)*KD + SWEEPID
+                              STIND      = COLPT-KD+1
+                              EDIND      = MIN(COLPT,N)
+                              IF( ( STIND.GE.EDIND-1 ).AND.
+     $                            ( EDIND.EQ.N ) ) THEN
+                                  BLKLASTIND = N
+                              ELSE
+                                  BLKLASTIND = 0
+                              ENDIF
+                          ENDIF
+*
+*                         Call the kernel
+*                             
+#if defined(_OPENMP)
+                          IF( TTYPE.NE.1 ) THEN      
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(in:WORK(MYID-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ELSE
+!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
+!$OMP$     DEPEND(out:WORK(MYID))
+                              TID      = OMP_GET_THREAD_NUM()
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                             STIND, EDIND, SWEEPID, N, KD, IB,
+     $                             WORK ( INDA ), LDA, 
+     $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                             WORK( INDW + TID*KD ) )
+!$OMP END TASK
+                          ENDIF
+#else
+                          CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+     $                         STIND, EDIND, SWEEPID, N, KD, IB,
+     $                         WORK ( INDA ), LDA, 
+     $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
+     $                         WORK( INDW + TID*KD ) )
+#endif 
+                          IF ( BLKLASTIND.GE.(N-1) ) THEN
+                              STT = STT + 1
+                              GOTO 130
+                          ENDIF
+  140                 CONTINUE
+  130             CONTINUE
+  120         CONTINUE
+  110     CONTINUE
+  100 CONTINUE
+*
+#if defined(_OPENMP)
+!$OMP END MASTER
+!$OMP END PARALLEL
+#endif
+*      
+*     Copy the diagonal from A to D. Note that D is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      DO 150 I = 1, N
+          D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) )
+  150 CONTINUE
+*      
+*     Copy the off diagonal from A to E. Note that E is REAL thus only
+*     the Real part is needed, the imaginary part should be zero.
+*
+      IF( UPPER ) THEN
+          DO 160 I = 1, N-1
+             E( I ) = DBLE( WORK( OFDPOS+I*LDA ) )
+  160     CONTINUE
+      ELSE
+          DO 170 I = 1, N-1
+             E( I ) = DBLE( WORK( OFDPOS+(I-1)*LDA ) )
+  170     CONTINUE
+      ENDIF
+*
+  200 CONTINUE  
+*
+      HOUS( 1 ) = LHMIN
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of ZHETRD_HB2ST
+*
+      END
+#undef PRECISION_COMPLEX
+      
diff --git a/SRC/zhetrd_he2hb.f b/SRC/zhetrd_he2hb.f
new file mode 100644
index 00000000..9403b73e
--- /dev/null
+++ b/SRC/zhetrd_he2hb.f
@@ -0,0 +1,517 @@
+*> \brief \b ZHETRD_HE2HB
+*
+*  @precisions fortran z -> s d c
+*      
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZHETRD_HE2HB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+*                              WORK, LWORK, INFO )
+*
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * ), AB( LDAB, * ), 
+*                          TAU( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian
+*> band-diagonal form AB by a unitary similarity transformation:
+*> Q**H * A * Q = AB.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          = 'U':  Upper triangle of A is stored;
+*>          = 'L':  Lower triangle of A is stored.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KD
+*> \verbatim
+*>          KD is INTEGER
+*>          The number of superdiagonals of the reduced matrix if UPLO = 'U',
+*>          or the number of subdiagonals if UPLO = 'L'.  KD >= 0.
+*>          The reduced matrix is stored in the array AB.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
+*>          N-by-N upper triangular part of A contains the upper
+*>          triangular part of the matrix A, and the strictly lower
+*>          triangular part of A is not referenced.  If UPLO = 'L', the
+*>          leading N-by-N lower triangular part of A contains the lower
+*>          triangular part of the matrix A, and the strictly upper
+*>          triangular part of A is not referenced.
+*>          On exit, if UPLO = 'U', the diagonal and first superdiagonal
+*>          of A are overwritten by the corresponding elements of the
+*>          tridiagonal matrix T, and the elements above the first
+*>          superdiagonal, with the array TAU, represent the unitary
+*>          matrix Q as a product of elementary reflectors; if UPLO
+*>          = 'L', the diagonal and first subdiagonal of A are over-
+*>          written by the corresponding elements of the tridiagonal
+*>          matrix T, and the elements below the first subdiagonal, with
+*>          the array TAU, represent the unitary matrix Q as a product
+*>          of elementary reflectors. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB,N)
+*>          On exit, the upper or lower triangle of the Hermitian band
+*>          matrix A, stored in the first KD+1 rows of the array.  The
+*>          j-th column of A is stored in the j-th column of the array AB
+*>          as follows:
+*>          if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
+*>          if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KD+1.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (N-KD)
+*>          The scalar factors of the elementary reflectors (see Further
+*>          Details).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension LWORK.
+*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK(1) returns the size of LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK which should be calculated
+*           by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
+*>          LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD
+*>          where FACTOPTNB is the blocking used by the QR or LQ
+*>          algorithm, usually FACTOPTNB=128 is a good choice otherwise
+*>          putting LWORK=-1 will provide the size of WORK.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2016
+*
+*> \ingroup complex16HEcomputational
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Implemented by Azzam Haidar.
+*>
+*>  All details are available on technical report, SC11, SC13 papers.
+*>
+*>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
+*>  Parallel reduction to condensed forms for symmetric eigenvalue problems
+*>  using aggregated fine-grained and memory-aware kernels. In Proceedings
+*>  of 2011 International Conference for High Performance Computing,
+*>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
+*>  Article 8 , 11 pages.
+*>  http://doi.acm.org/10.1145/2063384.2063394
+*>
+*>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
+*>  An improved parallel singular value algorithm and its implementation 
+*>  for multicore hardware, In Proceedings of 2013 International Conference
+*>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
+*>  Denver, Colorado, USA, 2013.
+*>  Article 90, 12 pages.
+*>  http://doi.acm.org/10.1145/2503210.2503292
+*>
+*>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  calculations based on fine-grained memory aware tasks.
+*>  International Journal of High Performance Computing Applications.
+*>  Volume 28 Issue 2, Pages 196-209, May 2014.
+*>  http://hpc.sagepub.com/content/28/2/196 
+*>
+*> \endverbatim
+*>
+*> \verbatim
+*>
+*>  If UPLO = 'U', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**H
+*>
+*>  where tau is a complex scalar, and v is a complex vector with
+*>  v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in
+*>  A(i,i+kd+1:n), and tau in TAU(i).
+*>
+*>  If UPLO = 'L', the matrix Q is represented as a product of elementary
+*>  reflectors
+*>
+*>     Q = H(1) H(2) . . . H(k), where k = n-kd.
+*>
+*>  Each H(i) has the form
+*>
+*>     H(i) = I - tau * v * v**H
+*>
+*>  where tau is a complex scalar, and v is a complex vector with
+*>  v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in
+*   A(i+kd+2:n,i), and tau in TAU(i).
+*>
+*>  The contents of A on exit are illustrated by the following examples
+*>  with n = 5:
+*>
+*>  if UPLO = 'U':                       if UPLO = 'L':
+*>
+*>    (  ab  ab/v1  v1      v1     v1    )              (  ab                            )
+*>    (      ab     ab/v2   v2     v2    )              (  ab/v1  ab                     )
+*>    (             ab      ab/v3  v3    )              (  v1     ab/v2  ab              )
+*>    (                     ab     ab/v4 )              (  v1     v2     ab/v3  ab       )
+*>    (                            ab    )              (  v1     v2     v3     ab/v4 ab )
+*>
+*>  where d and e denote diagonal and off-diagonal elements of T, and vi
+*>  denotes an element of the vector defining H(i).
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, 
+     $                         WORK, LWORK, INFO )
+*
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INFO, LDA, LDAB, LWORK, N, KD
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), AB( LDAB, * ), 
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   RONE
+      COMPLEX*16         ZERO, ONE, HALF
+      PARAMETER          ( RONE = 1.0D+0,
+     $                   ZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   ONE = ( 1.0D+0, 0.0D+0 ),
+     $                   HALF = ( 0.5D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, UPPER
+      INTEGER            I, J, IINFO, LWMIN, PN, PK, LK,
+     $                   LDT, LDW, LDS2, LDS1, 
+     $                   LS2, LS1, LW, LT,
+     $                   TPOS, WPOS, S2POS, S1POS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZHER2K, ZHEMM, ZGEMM,
+     $                   ZLARFT, ZGELQF, ZGEQRF, ZLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MIN, MAX
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV 
+      EXTERNAL           LSAME, ILAENV
+*     ..
+*     .. Executable Statements ..
+*
+*     Determine the minimal workspace size required 
+*     and test the input parameters
+*
+      INFO   = 0
+      UPPER  = LSAME( UPLO, 'U' )
+      LQUERY = ( LWORK.EQ.-1 )
+      LWMIN  = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+      
+      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( KD.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -5
+      ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN
+         INFO = -7
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
+         INFO = -10
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
+      END IF
+*
+*     Quick return if possible        
+*     Copy the upper/lower portion of A into AB 
+*
+      IF( N.LE.KD+1 ) THEN
+          IF( UPPER ) THEN
+              DO 100 I = 1, N
+                  LK = MIN( KD+1, I )
+                  CALL ZCOPY( LK, A( I-LK+1, I ), 1, 
+     $                            AB( KD+1-LK+1, I ), 1 )
+  100         CONTINUE
+          ELSE
+              DO 110 I = 1, N
+                  LK = MIN( KD+1, N-I+1 )
+                  CALL ZCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 )
+  110         CONTINUE
+          ENDIF
+          WORK( 1 ) = 1
+          RETURN
+      END IF
+*
+*     Determine the pointer position for the workspace
+*      
+      LDT    = KD
+      LDS1   = KD
+      LT     = LDT*KD
+      LW     = N*KD
+      LS1    = LDS1*KD
+      LS2    = LWMIN - LT - LW - LS1
+*      LS2 = N*MAX(KD,FACTOPTNB) 
+      TPOS   = 1
+      WPOS   = TPOS  + LT
+      S1POS  = WPOS  + LW
+      S2POS  = S1POS + LS1 
+      IF( UPPER ) THEN
+          LDW    = KD
+          LDS2   = KD
+      ELSE
+          LDW    = N
+          LDS2   = N
+      ENDIF
+*
+*
+*     Set the workspace of the triangular matrix T to zero once such a
+*     way everytime T is generated the upper/lower portion will be always zero  
+*   
+      CALL ZLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
+*
+      IF( UPPER ) THEN
+          DO 10 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the LQ factorization of the current block
+*        
+             CALL ZGELQF( KD, PN, A( I, I+KD ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB
+*        
+             DO 20 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   20        CONTINUE
+*                
+             CALL ZLASET( 'Lower', PK, PK, ZERO, ONE, 
+     $                    A( I, I+KD ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL ZLARFT( 'Forward', 'Rowwise', PN, PK,
+     $                    A( I, I+KD ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL ZGEMM( 'Conjugate', 'No transpose', PK, PN, PK,
+     $                   ONE,  WORK( TPOS ), LDT,
+     $                         A( I, I+KD ), LDA,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL ZHEMM( 'Right', UPLO, PK, PN,
+     $                   ONE,  A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL ZGEMM( 'No transpose', 'Conjugate', PK, PK, PN,
+     $                   ONE,  WORK( WPOS ), LDW,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL ZGEMM( 'No transpose', 'No transpose', PK, PN, PK,
+     $                   -HALF, WORK( S1POS ), LDS1, 
+     $                          A( I, I+KD ), LDA,
+     $                   ONE,   WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V'*W - W'*V
+*        
+             CALL ZHER2K( UPLO, 'Conjugate', PN, PK,
+     $                    -ONE, A( I, I+KD ), LDA,
+     $                          WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+   10     CONTINUE
+*
+*        Copy the upper band to AB which is the band storage matrix
+*
+         DO 30 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 )
+   30    CONTINUE
+*
+      ELSE
+*
+*         Reduce the lower triangle of A to lower band matrix
+*        
+          DO 40 I = 1, N - KD, KD
+             PN = N-I-KD+1
+             PK = MIN( N-I-KD+1, KD )
+*        
+*            Compute the QR factorization of the current block
+*        
+             CALL ZGEQRF( PN, KD, A( I+KD, I ), LDA,
+     $                    TAU( I ), WORK( S2POS ), LS2, IINFO )
+*        
+*            Copy the upper portion of A into AB 
+*        
+             DO 50 J = I, I+PK-1
+                LK = MIN( KD, N-J ) + 1
+                CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   50        CONTINUE
+*                
+             CALL ZLASET( 'Upper', PK, PK, ZERO, ONE, 
+     $                    A( I+KD, I ), LDA )
+*        
+*            Form the matrix T
+*        
+             CALL ZLARFT( 'Forward', 'Columnwise', PN, PK,
+     $                    A( I+KD, I ), LDA, TAU( I ), 
+     $                    WORK( TPOS ), LDT )
+*        
+*            Compute W:
+*             
+             CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   ONE, A( I+KD, I ), LDA,
+     $                         WORK( TPOS ), LDT,
+     $                   ZERO, WORK( S2POS ), LDS2 )
+*        
+             CALL ZHEMM( 'Left', UPLO, PN, PK,
+     $                   ONE, A( I+KD, I+KD ), LDA,
+     $                         WORK( S2POS ), LDS2,
+     $                   ZERO, WORK( WPOS ), LDW )
+*        
+             CALL ZGEMM( 'Conjugate', 'No transpose', PK, PK, PN,
+     $                   ONE, WORK( S2POS ), LDS2,
+     $                         WORK( WPOS ), LDW,
+     $                   ZERO, WORK( S1POS ), LDS1 )
+*        
+             CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK,
+     $                   -HALF, A( I+KD, I ), LDA,
+     $                         WORK( S1POS ), LDS1,
+     $                   ONE, WORK( WPOS ), LDW )
+*             
+*        
+*            Update the unreduced submatrix A(i+kd:n,i+kd:n), using
+*            an update of the form:  A := A - V*W' - W*V'
+*        
+             CALL ZHER2K( UPLO, 'No transpose', PN, PK,
+     $                    -ONE, A( I+KD, I ), LDA,
+     $                           WORK( WPOS ), LDW,
+     $                    RONE, A( I+KD, I+KD ), LDA )
+*            ==================================================================
+*            RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED
+*             DO 45 J = I, I+PK-1
+*                LK = MIN( KD, N-J ) + 1
+*                CALL ZCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 )
+*   45        CONTINUE
+*            ==================================================================
+   40     CONTINUE
+*
+*        Copy the lower band to AB which is the band storage matrix
+*
+         DO 60 J = N-KD+1, N
+            LK = MIN(KD, N-J) + 1
+            CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 )
+   60    CONTINUE
+
+      END IF
+*
+      WORK( 1 ) = LWMIN
+      RETURN
+*
+*     End of ZHETRD_HE2HB
+*
+      END
diff --git a/SRC/zlarfy.f b/SRC/zlarfy.f
new file mode 100644
index 00000000..39b795f0
--- /dev/null
+++ b/SRC/zlarfy.f
@@ -0,0 +1,163 @@
+*> \brief \b ZLARFY
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          UPLO
+*       INTEGER            INCV, LDC, N
+*       COMPLEX*16         TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         C( LDC, * ), V( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFY applies an elementary reflector, or Householder matrix, H,
+*> to an n x n Hermitian matrix C, from both the left and the right.
+*>
+*> H is represented in the form
+*>
+*>    H = I - tau * v * v'
+*>
+*> where  tau  is a scalar and  v  is a vector.
+*>
+*> If  tau  is  zero, then  H  is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          Specifies whether the upper or lower triangular part of the
+*>          Hermitian matrix C is stored.
+*>          = 'U':  Upper triangle
+*>          = 'L':  Lower triangle
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of rows and columns of the matrix C.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                  (1 + (N-1)*abs(INCV))
+*>          The vector v as described above.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between successive elements of v.  INCV must
+*>          not be zero.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16
+*>          The value tau as described above.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension (LDC, N)
+*>          On entry, the matrix C.
+*>          On exit, C is overwritten by H * C * H'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C.  LDC >= max( 1, N ).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (N)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2011
+*
+*> \ingroup complex16_eig
+*
+*  =====================================================================
+      SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK test routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          UPLO
+      INTEGER            INCV, LDC, N
+      COMPLEX*16         TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE, ZERO, HALF
+      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ),
+     $                   ZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   HALF = ( 0.5D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZAXPY, ZHEMV, ZHER2
+*     ..
+*     .. External Functions ..
+      COMPLEX*16         ZDOTC
+      EXTERNAL           ZDOTC
+*     ..
+*     .. Executable Statements ..
+*
+      IF( TAU.EQ.ZERO )
+     $   RETURN
+*
+*     Form  w:= C * v
+*
+      CALL ZHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+      ALPHA = -HALF*TAU*ZDOTC( N, WORK, 1, V, INCV )
+      CALL ZAXPY( N, ALPHA, V, INCV, WORK, 1 )
+*
+*     C := C - v * w' - w * v'
+*
+      CALL ZHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC )
+*
+      RETURN
+*
+*     End of ZLARFY
+*
+      END