diff options
author | julielangou <julie@cs.utk.edu> | 2016-11-19 14:40:33 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-11-19 14:40:33 -0800 |
commit | 01cdfedf1d726a003f7a4e7331f32a7e434f1707 (patch) | |
tree | 4a9ff4318f5a1d66a4dc2690d9dc5ccb242981a3 /SRC | |
parent | ead2c73f1a6dad1342bf32987c0b2f2eaf61f18a (diff) | |
parent | ae36f785d68d637ef0a60f9028c38eb2d369d9f1 (diff) | |
download | lapack-01cdfedf1d726a003f7a4e7331f32a7e434f1707.tar.gz lapack-01cdfedf1d726a003f7a4e7331f32a7e434f1707.tar.bz2 lapack-01cdfedf1d726a003f7a4e7331f32a7e434f1707.zip |
Merge pull request #83 from haidarazzam/master
adding the 2stage symmetric eigenvalue routines drivers checking
Diffstat (limited to 'SRC')
55 files changed, 24112 insertions, 10 deletions
diff --git a/SRC/Makefile b/SRC/Makefile index 01cf7021..c521d7f6 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -56,7 +56,7 @@ include ../make.inc # ####################################################################### -ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ +ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o iparam2stage.o \ ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o @@ -120,7 +120,7 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \ + slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ @@ -167,7 +167,10 @@ SLASRC = \ sgelqt.o sgelqt3.o sgemlqt.o \ sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ sgelq.o slaswlq.o slamswlq.o sgemlq.o \ - stplqt.o stplqt2.o stpmlqt.o + stplqt.o stplqt2.o stpmlqt.o \ + ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ + ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ + ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o @@ -224,7 +227,7 @@ CLASRC = \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ - clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ + clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o \ clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ @@ -263,7 +266,10 @@ CLASRC = \ cgelqt.o cgelqt3.o cgemlqt.o \ cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ - ctplqt.o ctplqt2.o ctpmlqt.o + ctplqt.o ctplqt2.o ctpmlqt.o \ + chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ + cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ + chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o ifdef USEXBLAS CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ @@ -306,7 +312,7 @@ DLASRC = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ + dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ @@ -355,7 +361,10 @@ DLASRC = \ dgelqt.o dgelqt3.o dgemlqt.o \ dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ - dtplqt.o dtplqt2.o dtpmlqt.o + dtplqt.o dtplqt2.o dtpmlqt.o \ + dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ + dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ + dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o ifdef USEXBLAS DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ @@ -413,7 +422,7 @@ ZLASRC = \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ zlarcm.o zlarf.o zlarfb.o \ zlarfg.o zlarft.o zlarfgp.o \ - zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ + zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ zlassq.o zlaswp.o zlasyf.o zlasyf_rook.o zlasyf_rk.o \ zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlauu2.o \ @@ -455,7 +464,10 @@ ZLASRC = \ zgelqt.o zgelqt3.o zgemlqt.o \ zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ - ztplqt.o ztplqt2.o ztpmlqt.o + ztplqt.o ztplqt2.o ztpmlqt.o \ + zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ + zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ + zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o ifdef USEXBLAS ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ diff --git a/SRC/chb2st_kernels.f b/SRC/chb2st_kernels.f new file mode 100644 index 00000000..8b0a4b28 --- /dev/null +++ b/SRC/chb2st_kernels.f @@ -0,0 +1,320 @@ +*> \brief \b CHB2ST_KERNELS +* +* @generated from zhb2st_kernels.f, fortran z -> c, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHB2ST_KERNELS + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chb2st_kernels.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chb2st_kernels.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chb2st_kernels.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* ST, ED, SWEEP, N, NB, IB, +* A, LDA, V, TAU, LDVT, WORK) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* LOGICAL WANTZ +* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), V( * ), +* TAU( * ), WORK( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHB2ST_KERNELS is an internal routine used by the CHETRD_HB2ST +*> subroutine. +*> \endverbatim +* +* Arguments: +* ========== +* +*> @param[in] n +*> The order of the matrix A. +*> +*> @param[in] nb +*> The size of the band. +*> +*> @param[in, out] A +*> A pointer to the matrix A. +*> +*> @param[in] lda +*> The leading dimension of the matrix A. +*> +*> @param[out] V +*> COMPLEX array, dimension 2*n if eigenvalues only are +*> requested or to be queried for vectors. +*> +*> @param[out] TAU +*> COMPLEX array, dimension (2*n). +*> The scalar factors of the Householder reflectors are stored +*> in this array. +*> +*> @param[in] st +*> internal parameter for indices. +*> +*> @param[in] ed +*> internal parameter for indices. +*> +*> @param[in] sweep +*> internal parameter for indices. +*> +*> @param[in] Vblksiz +*> internal parameter for indices. +*> +*> @param[in] wantz +*> logical which indicate if Eigenvalue are requested or both +*> Eigenvalue/Eigenvectors. +*> +*> @param[in] work +*> Workspace of size nb. +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, + $ ST, ED, SWEEP, N, NB, IB, + $ A, LDA, V, TAU, LDVT, WORK) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + LOGICAL WANTZ + INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), V( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), + $ ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, + $ DPOS, OFDPOS, AJETER + COMPLEX CTMP +* .. +* .. External Subroutines .. + EXTERNAL CLARFG, CLARFX, CLARFY +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG, MOD +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. +* .. Executable Statements .. +* + AJETER = IB + LDVT + UPPER = LSAME( UPLO, 'U' ) + + IF( UPPER ) THEN + DPOS = 2 * NB + 1 + OFDPOS = 2 * NB + ELSE + DPOS = 1 + OFDPOS = 2 + ENDIF + +* +* Upper case +* + IF( UPPER ) THEN +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 101, 102, 103 ) TTYPE +* + 101 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 10 I = 1, LM-1 + V( VPOS+I ) = CONJG( A( OFDPOS-I, ST+I ) ) + A( OFDPOS-I, ST+I ) = ZERO + 10 CONTINUE + CTMP = CONJG( A( OFDPOS, ST ) ) + CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) + A( OFDPOS, ST ) = CTMP +* + 103 CONTINUE + LM = ED - ST + 1 + CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + GOTO 300 +* + 102 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 + IF( LM.GT.0) THEN + CALL CLARFX( 'Left', LN, LM, V( VPOS ), + $ CONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 30 I = 1, LM-1 + V( VPOS+I ) = CONJG( A( DPOS-NB-I, J1+I ) ) + A( DPOS-NB-I, J1+I ) = ZERO + 30 CONTINUE + CTMP = CONJG( A( DPOS-NB, J1 ) ) + CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) + A( DPOS-NB, J1 ) = CTMP +* + CALL CLARFX( 'Right', LN-1, LM, V( VPOS ), + $ TAU( TAUPOS ), + $ A( DPOS-NB+1, J1 ), LDA-1, WORK) + ENDIF + GOTO 300 +* +* Lower case +* + ELSE +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 201, 202, 203 ) TTYPE +* + 201 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 20 I = 1, LM-1 + V( VPOS+I ) = A( OFDPOS+I, ST-1 ) + A( OFDPOS+I, ST-1 ) = ZERO + 20 CONTINUE + CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + 203 CONTINUE + LM = ED - ST + 1 +* + CALL CLARFY( UPLO, LM, V( VPOS ), 1, CONJG( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + + GOTO 300 +* + 202 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 +* + IF( LM.GT.0) THEN + CALL CLARFX( 'Right', LM, LN, V( VPOS ), + $ TAU( TAUPOS ), A( DPOS+NB, ST ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 40 I = 1, LM-1 + V( VPOS+I ) = A( DPOS+NB+I, ST ) + A( DPOS+NB+I, ST ) = ZERO + 40 CONTINUE + CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + CALL CLARFX( 'Left', LM, LN-1, V( VPOS ), + $ CONJG( TAU( TAUPOS ) ), + $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) + + ENDIF + GOTO 300 + ENDIF + + 300 CONTINUE + RETURN +* +* END OF CHB2ST_KERNELS +* + END diff --git a/SRC/chbev_2stage.f b/SRC/chbev_2stage.f new file mode 100644 index 00000000..182d3d93 --- /dev/null +++ b/SRC/chbev_2stage.f @@ -0,0 +1,386 @@ +*> \brief <b> CHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from zhbev_2stage.f, fortran z -> c, Sat Nov 5 23:18:20 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHBEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. +* REAL RWORK( * ), W( * ) +* COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (max(1,3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. + REAL RWORK( * ), W( * ) + COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, WANTZ, LQUERY + INTEGER IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHB + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR + $ CHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -11 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHBEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + IF( LOWER ) THEN + W( 1 ) = REAL( AB( 1, 1 ) ) + ELSE + W( 1 ) = REAL( AB( KD+1, 1 ) ) + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = 1 + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, call CSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + INDRWK = INDE + N + CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of CHBEV_2STAGE +* + END diff --git a/SRC/chbevd_2stage.f b/SRC/chbevd_2stage.f new file mode 100644 index 00000000..89c118d3 --- /dev/null +++ b/SRC/chbevd_2stage.f @@ -0,0 +1,458 @@ +*> \brief <b> CHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from zhbevd_2stage.f, fortran z -> c, Sat Nov 5 23:18:17 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHBEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, RWORK, LRWORK, IWORK, +* LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* REAL RWORK( * ), W( * ) +* COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it +*> uses a divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, +*> dimension (LRWORK) +*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The dimension of array RWORK. +*> If N <= 1, LRWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LRWORK must be at least N. +*> If JOBZ = 'V' and N > 1, LRWORK must be at least +*> 1 + 5*N + 2*N**2. +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of array IWORK. +*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N . +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, RWORK, LRWORK, IWORK, + $ LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL RWORK( * ), W( * ) + COMPLEX AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), + $ CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE, + $ LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS, + $ LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHB + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, SSTERF, XERBLA, CGEMM, CLACPY, + $ CLASCL, CSTEDC, CHETRD_HB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) +* + INFO = 0 + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LWMIN = 2*N**2 + LRWMIN = 1 + 5*N + 2*N**2 + LIWMIN = 3 + 5*N + ELSE + LWMIN = MAX( N, LHTRD + LWTRD ) + LRWMIN = N + LIWMIN = 1 + END IF + END IF + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHBEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = REAL( AB( 1, 1 ) ) + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDE = 1 + INDRWK = INDE + N + LLRWK = LRWORK - INDRWK + 1 + INDHOUS = 1 + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + INDWK2 = INDWK + N*N + LLWK2 = LWORK - INDWK2 + 1 +* + CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, call CSTEDC. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ), + $ LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK, + $ INFO ) + CALL CGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO, + $ WORK( INDWK2 ), N ) + CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN + RETURN +* +* End of CHBEVD_2STAGE +* + END diff --git a/SRC/chbevx_2stage.f b/SRC/chbevx_2stage.f new file mode 100644 index 00000000..07eb6153 --- /dev/null +++ b/SRC/chbevx_2stage.f @@ -0,0 +1,646 @@ +*> \brief <b> CHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from zhbevx_2stage.f, fortran z -> c, Sat Nov 5 23:18:22 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHBEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, +* Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, +* Z, LDZ, WORK, LWORK, RWORK, IWORK, +* IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* REAL RWORK( * ), W( * ) +* COMPLEX AB( LDAB, * ), Q( LDQ, * ), WORK( * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors +*> can be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found; +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found; +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] Q +*> \verbatim +*> Q is COMPLEX array, dimension (LDQ, N) +*> If JOBZ = 'V', the N-by-N unitary matrix used in the +*> reduction to tridiagonal form. +*> If JOBZ = 'N', the array Q is not referenced. +*> \endverbatim +*> +*> \param[in] LDQ +*> \verbatim +*> LDQ is INTEGER +*> The leading dimension of the array Q. If JOBZ = 'V', then +*> LDQ >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing AB to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*SLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*SLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (LWORK) +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (7*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complexOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, + $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, + $ Z, LDZ, WORK, LWORK, RWORK, IWORK, + $ IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + REAL RWORK( * ), W( * ) + COMPLEX AB( LDAB, * ), Q( LDQ, * ), WORK( * ), + $ Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), + $ CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ, + $ LQUERY + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ J, JJ, NSPLIT + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU + COMPLEX CTMP1 +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHB + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY, + $ CGEMV, CLACPY, CLASCL, CSTEIN, CSTEQR, + $ CSWAP, CHETRD_HB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -7 + ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -11 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -12 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -13 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) + $ INFO = -18 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'CHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -20 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHBEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + M = 1 + IF( LOWER ) THEN + CTMP1 = AB( 1, 1 ) + ELSE + CTMP1 = AB( KD+1, 1 ) + END IF + TMP1 = REAL( CTMP1 ) + IF( VALEIG ) THEN + IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) ) + $ M = 0 + END IF + IF( M.EQ.1 ) THEN + W( 1 ) = REAL( CTMP1 ) + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + ELSE + VLL = ZERO + VUU = ZERO + END IF + ANRM = CLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL CLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL CLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call CHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDRWK = INDE + N +* + INDHOUS = 1 + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL CHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB, + $ RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal +* to zero, then call SSTERF or CSTEQR. If this fails for some +* eigenvalue, then try SSTEBZ. +* + TEST = .FALSE. + IF (INDEIG) THEN + IF (IL.EQ.1 .AND. IU.EQ.N) THEN + TEST = .TRUE. + END IF + END IF + IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN + CALL SCOPY( N, RWORK( INDD ), 1, W, 1 ) + INDEE = INDRWK + 2*N + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL SSTERF( N, W, RWORK( INDEE ), INFO ) + ELSE + CALL CLACPY( 'A', N, N, Q, LDQ, Z, LDZ ) + CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 10 I = 1, N + IFAIL( I ) = 0 + 10 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWK = INDISP + N + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWK ), INFO ) +* + IF( WANTZ ) THEN + CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by CSTEIN. +* + DO 20 J = 1, M + CALL CCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 ) + CALL CGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO, + $ Z( 1, J ), 1 ) + 20 CONTINUE + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of CHBEVX_2STAGE +* + END diff --git a/SRC/cheev_2stage.f b/SRC/cheev_2stage.f new file mode 100644 index 00000000..b98dac76 --- /dev/null +++ b/SRC/cheev_2stage.f @@ -0,0 +1,355 @@ +*> \brief <b> CHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @generated from zheev_2stage.f, fortran z -> c, Sat Nov 5 23:18:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHEEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. +* REAL RWORK( * ), W( * ) +* COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (max(1, 3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexHEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + REAL RWORK( * ), W( * ) + COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR, + $ CUNGTR, CHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -8 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHEEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + W( 1 ) = REAL( A( 1, 1 ) ) + WORK( 1 ) = 1 + IF( WANTZ ) + $ A( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDE = 1 + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, first call +* CUNGTR to generate the unitary matrix, then call CSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL CUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), + $ LLWORK, IINFO ) + INDWRK = INDE + N + CALL CSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA, + $ RWORK( INDWRK ), INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal complex workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of CHEEV_2STAGE +* + END diff --git a/SRC/cheevd_2stage.f b/SRC/cheevd_2stage.f new file mode 100644 index 00000000..9d1057fc --- /dev/null +++ b/SRC/cheevd_2stage.f @@ -0,0 +1,451 @@ +*> \brief <b> CHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @generated from zheevd_2stage.f, fortran z -> c, Sat Nov 5 23:18:14 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHEEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* RWORK, LRWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* REAL RWORK( * ), W( * ) +* COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses a +*> divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N+1 +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N+1 +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2 +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, +*> dimension (LRWORK) +*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The dimension of the array RWORK. +*> If N <= 1, LRWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LRWORK must be at least N. +*> If JOBZ = 'V' and N > 1, LRWORK must be at least +*> 1 + 5*N + 2*N**2. +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed +*> to converge; i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> if INFO = i and JOBZ = 'V', then the algorithm failed +*> to compute an eigenvalue while working on the submatrix +*> lying in rows and columns INFO/(N+1) through +*> mod(INFO,N+1). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexHEeigen +* +*> \par Further Details: +* ===================== +*> +*> Modified description of INFO. Sven, 16 Feb 05. +* +*> \par Contributors: +* ================== +*> +*> Jeff Rutter, Computer Science Division, University of California +*> at Berkeley, USA +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ RWORK, LRWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL RWORK( * ), W( * ) + COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2, + $ INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK, + $ LLWRK2, LRWMIN, LWMIN, + $ LHTRD, LWTRD, KD, IB, INDHOUS + + + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, SSTERF, XERBLA, CLACPY, CLASCL, + $ CSTEDC, CUNMTR, CHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LWMIN = 2*N + N*N + LRWMIN = 1 + 5*N + 2*N**2 + LIWMIN = 3 + 5*N + ELSE + LWMIN = N + 1 + LHTRD + LWTRD + LRWMIN = N + LIWMIN = 1 + END IF + END IF + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -8 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHEEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = REAL( A( 1, 1 ) ) + IF( WANTZ ) + $ A( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL CLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDE = 1 + INDRWK = INDE + N + LLRWK = LRWORK - INDRWK + 1 + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, first call +* CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the +* tridiagonal matrix, then call CUNMTR to multiply it to the +* Householder transformations represented as Householder vectors in +* A. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL CSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK, + $ IWORK, LIWORK, INFO ) + CALL CUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ), + $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO ) + CALL CLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of CHEEVD_2STAGE +* + END diff --git a/SRC/cheevr_2stage.f b/SRC/cheevr_2stage.f new file mode 100644 index 00000000..23a98389 --- /dev/null +++ b/SRC/cheevr_2stage.f @@ -0,0 +1,779 @@ +*> \brief <b> CHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @generated from zheevr_2stage.f, fortran z -> c, Sat Nov 5 23:18:11 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHEEVR_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevr_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevr_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevr_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, +* WORK, LWORK, RWORK, LRWORK, IWORK, +* LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK, +* $ M, N +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER ISUPPZ( * ), IWORK( * ) +* REAL RWORK( * ), W( * ) +* COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> +*> CHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call +*> to CHETRD. Then, whenever possible, CHEEVR_2STAGE calls CSTEMR to compute +*> eigenspectrum using Relatively Robust Representations. CSTEMR +*> computes eigenvalues by the dqds algorithm, while orthogonal +*> eigenvectors are computed from various "good" L D L^T representations +*> (also known as Relatively Robust Representations). Gram-Schmidt +*> orthogonalization is avoided as far as possible. More specifically, +*> the various steps of the algorithm are as follows. +*> +*> For each unreduced block (submatrix) of T, +*> (a) Compute T - sigma I = L D L^T, so that L and D +*> define all the wanted eigenvalues to high relative accuracy. +*> This means that small relative changes in the entries of D and L +*> cause only small relative changes in the eigenvalues and +*> eigenvectors. The standard (unfactored) representation of the +*> tridiagonal matrix T does not have this property in general. +*> (b) Compute the eigenvalues to suitable accuracy. +*> If the eigenvectors are desired, the algorithm attains full +*> accuracy of the computed eigenvalues only right before +*> the corresponding vectors have to be computed, see steps c) and d). +*> (c) For each cluster of close eigenvalues, select a new +*> shift close to the cluster, find a new factorization, and refine +*> the shifted eigenvalues to suitable accuracy. +*> (d) For each eigenvalue with a large enough relative separation compute +*> the corresponding eigenvector by forming a rank revealing twisted +*> factorization. Go back to (c) for any clusters that remain. +*> +*> The desired accuracy of the output can be specified by the input +*> parameter ABSTOL. +*> +*> For more details, see DSTEMR's documentation and: +*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations +*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices," +*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004. +*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and +*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25, +*> 2004. Also LAPACK Working Note 154. +*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric +*> tridiagonal eigenvalue/eigenvector problem", +*> Computer Science Division Technical Report No. UCB/CSD-97-971, +*> UC Berkeley, May 1997. +*> +*> +*> Note 1 : CHEEVR_2STAGE calls CSTEMR when the full spectrum is requested +*> on machines which conform to the ieee-754 floating point standard. +*> CHEEVR_2STAGE calls SSTEBZ and CSTEIN on non-ieee machines and +*> when partial spectrum requests are made. +*> +*> Normal execution of CSTEMR may create NaNs and infinities and +*> hence may abort due to a floating point exception in environments +*> which do not handle NaNs and infinities in the ieee standard default +*> manner. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and +*> CSTEIN are called +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> +*> If high relative accuracy is important, set ABSTOL to +*> SLAMCH( 'Safe minimum' ). Doing so will guarantee that +*> eigenvalues are computed to high relative accuracy when +*> possible in future releases. The current code does not +*> make any guarantees about high relative accuracy, but +*> furutre releases will. See J. Barlow and J. Demmel, +*> "Computing Accurate Eigensystems of Scaled Diagonally +*> Dominant Matrices", LAPACK Working Note #7, for a discussion +*> of which matrices define their eigenvalues to high relative +*> accuracy. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] ISUPPZ +*> \verbatim +*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) ) +*> The support of the eigenvectors in Z, i.e., the indices +*> indicating the nonzero elements in Z. The i-th eigenvector +*> is nonzero only in elements ISUPPZ( 2*i-1 ) through +*> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal +*> matrix). The support of the eigenvectors of A is typically +*> 1:N because of the unitary transformations applied by CUNMTR. +*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 26*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) +*> On exit, if INFO = 0, RWORK(1) returns the optimal +*> (and minimal) LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal +*> (and minimal) LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: Internal error +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complexHEeigen +* +*> \par Contributors: +* ================== +*> +*> Inderjit Dhillon, IBM Almaden, USA \n +*> Osni Marques, LBNL/NERSC, USA \n +*> Ken Stanley, Computer Science Division, University of +*> California at Berkeley, USA \n +*> Jason Riedy, Computer Science Division, University of +*> California at Berkeley, USA \n +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, + $ WORK, LWORK, RWORK, LRWORK, IWORK, + $ LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK, + $ M, N + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER ISUPPZ( * ), IWORK( * ) + REAL RWORK( * ), W( * ) + COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ, TRYRAC + CHARACTER ORDER + INTEGER I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP, + $ INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK, + $ INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ, + $ LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN, + $ LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANSY + EXTERNAL LSAME, ILAENV, SLAMCH, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, + $ CHETRD_2STAGE, CSTEMR, CSTEIN, CSWAP, CUNMTR +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IEEEOK = ILAENV( 10, 'CHEEVR', 'N', 1, 2, 3, 4 ) +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) +* + LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. + $ ( LIWORK.EQ.-1 ) ) +* + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + LRWMIN = MAX( 1, 24*N ) + LIWMIN = MAX( 1, 10*N ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -18 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -20 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -22 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHEEVR_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( N.EQ.1 ) THEN + WORK( 1 ) = 2 + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = REAL( A( 1, 1 ) ) + ELSE + IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) ) + $ THEN + M = 1 + W( 1 ) = REAL( A( 1, 1 ) ) + END IF + END IF + IF( WANTZ ) THEN + Z( 1, 1 ) = ONE + ISUPPZ( 1 ) = 1 + ISUPPZ( 2 ) = 1 + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF (VALEIG) THEN + VLL = VL + VUU = VU + END IF + ANRM = CLANSY( 'M', UPLO, N, A, LDA, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL CSSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF + +* Initialize indices into workspaces. Note: The IWORK indices are +* used only if SSTERF or CSTEMR fail. + +* WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the +* elementary reflectors used in CHETRD. + INDTAU = 1 +* INDWK is the starting offset of the remaining complex workspace, +* and LLWORK is the remaining complex workspace size. + INDHOUS = INDTAU + N + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + +* RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal +* entries. + INDRD = 1 +* RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the +* tridiagonal matrix from CHETRD. + INDRE = INDRD + N +* RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over +* -written by CSTEMR (the SSTERF path copies the diagonal to W). + INDRDD = INDRE + N +* RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over +* -written while computing the eigenvalues in SSTERF and CSTEMR. + INDREE = INDRDD + N +* INDRWK is the starting offset of the left-over real workspace, and +* LLRWORK is the remaining workspace size. + INDRWK = INDREE + N + LLRWORK = LRWORK - INDRWK + 1 + +* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and +* stores the block indices of each of the M<=N eigenvalues. + INDIBL = 1 +* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and +* stores the starting and finishing indices of each block. + INDISP = INDIBL + N +* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors +* that corresponding to eigenvectors that fail to converge in +* CSTEIN. This information is discarded; if any fail, the driver +* returns INFO > 0. + INDIFL = INDISP + N +* INDIWO is the offset of the remaining integer workspace. + INDIWO = INDIFL + N + +* +* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + $ RWORK( INDRE ), WORK( INDTAU ), + $ WORK( INDHOUS ), LHTRD, + $ WORK( INDWK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired +* then call SSTERF or CSTEMR and CUNMTR. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N, RWORK( INDRD ), 1, W, 1 ) + CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) + CALL SSTERF( N, W, RWORK( INDREE ), INFO ) + ELSE + CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) + CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) +* + IF (ABSTOL .LE. TWO*N*EPS) THEN + TRYRAC = .TRUE. + ELSE + TRYRAC = .FALSE. + END IF + CALL CSTEMR( JOBZ, 'A', N, RWORK( INDRDD ), + $ RWORK( INDREE ), VL, VU, IL, IU, M, W, + $ Z, LDZ, N, ISUPPZ, TRYRAC, + $ RWORK( INDRWK ), LLRWORK, + $ IWORK, LIWORK, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by CSTEMR. +* + IF( WANTZ .AND. INFO.EQ.0 ) THEN + INDWKN = INDWK + LLWRKN = LWORK - INDWKN + 1 + CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, + $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ), + $ LLWRKN, IINFO ) + END IF + END IF +* +* + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. +* Also call SSTEBZ and CSTEIN if CSTEMR fails. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL CSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ), + $ INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by CSTEIN. +* + INDWKN = INDWK + LLWRKN = LWORK - INDWKN + 1 + CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of CHEEVR_2STAGE +* + END diff --git a/SRC/cheevx_2stage.f b/SRC/cheevx_2stage.f new file mode 100644 index 00000000..84ae438d --- /dev/null +++ b/SRC/cheevx_2stage.f @@ -0,0 +1,618 @@ +*> \brief <b> CHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @generated from zheevx_2stage.f, fortran z -> c, Sat Nov 5 23:18:09 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHEEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cheevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cheevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cheevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, WORK, +* LWORK, RWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* REAL RWORK( * ), W( * ) +* COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*SLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*SLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> On normal exit, the first M elements contain the selected +*> eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 8*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (7*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complexHEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK, + $ LWORK, RWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + REAL RWORK( * ), W( * ) + COMPLEX A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, + $ ITMP1, J, JJ, LLWORK, + $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANHE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, + $ CLACPY, CSTEIN, CSTEQR, CSWAP, CUNGTR, CUNMTR, + $ CHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + END IF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -17 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHEEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = REAL( A( 1, 1 ) ) + ELSE IF( VALEIG ) THEN + IF( VL.LT.REAL( A( 1, 1 ) ) .AND. VU.GE.REAL( A( 1, 1 ) ) ) + $ THEN + M = 1 + W( 1 ) = REAL( A( 1, 1 ) ) + END IF + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + END IF + ANRM = CLANHE( 'M', UPLO, N, A, LDA, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL CSSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL CSSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDRWK = INDE + N + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ), + $ RWORK( INDE ), WORK( INDTAU ), + $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ), + $ LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal to +* zero, then call SSTERF or CUNGTR and CSTEQR. If this fails for +* some eigenvalue, then try SSTEBZ. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN + CALL SCOPY( N, RWORK( INDD ), 1, W, 1 ) + INDEE = INDRWK + 2*N + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL SSTERF( N, W, RWORK( INDEE ), INFO ) + ELSE + CALL CLACPY( 'A', N, N, A, LDA, Z, LDZ ) + CALL CUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ), + $ WORK( INDWRK ), LLWORK, IINFO ) + CALL SCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL CSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 30 I = 1, N + IFAIL( I ) = 0 + 30 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 40 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, CSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWK = INDISP + N + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWK ), INFO ) +* + IF( WANTZ ) THEN + CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by CSTEIN. +* + CALL CUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWRK ), LLWORK, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 40 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 60 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 50 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 50 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 60 CONTINUE + END IF +* +* Set WORK(1) to optimal complex workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of CHEEVX_2STAGE +* + END diff --git a/SRC/chegv_2stage.f b/SRC/chegv_2stage.f new file mode 100644 index 00000000..71d58d74 --- /dev/null +++ b/SRC/chegv_2stage.f @@ -0,0 +1,379 @@ +*> \brief \b CHEGV_2STAGE +* +* @generated from zhegv_2stage.f, fortran z -> c, Sun Nov 6 13:09:52 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHEGV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chegv_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chegv_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chegv_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, +* WORK, LWORK, RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. +* REAL RWORK( * ), W( * ) +* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors +*> of a complex generalized Hermitian-definite eigenproblem, of the form +*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. +*> Here A and B are assumed to be Hermitian and B is also +*> positive definite. +*> This routine use the 2stage technique for the reduction to tridiagonal +*> which showed higher performance on recent architecture and for large +* sizes N>2000. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] ITYPE +*> \verbatim +*> ITYPE is INTEGER +*> Specifies the problem type to be solved: +*> = 1: A*x = (lambda)*B*x +*> = 2: A*B*x = (lambda)*x +*> = 3: B*A*x = (lambda)*x +*> \endverbatim +*> +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangles of A and B are stored; +*> = 'L': Lower triangles of A and B are stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrices A and B. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> matrix Z of eigenvectors. The eigenvectors are normalized +*> as follows: +*> if ITYPE = 1 or 2, Z**H*B*Z = I; +*> if ITYPE = 3, Z**H*inv(B)*Z = I. +*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') +*> or the lower triangle (if UPLO='L') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB, N) +*> On entry, the Hermitian positive definite matrix B. +*> If UPLO = 'U', the leading N-by-N upper triangular part of B +*> contains the upper triangular part of the matrix B. +*> If UPLO = 'L', the leading N-by-N lower triangular part of B +*> contains the lower triangular part of the matrix B. +*> +*> On exit, if INFO <= N, the part of B containing the matrix is +*> overwritten by the triangular factor U or L from the Cholesky +*> factorization B = U**H*U or B = L*L**H. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (max(1, 3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: CPOTRF or CHEEV returned an error code: +*> <= N: if INFO = i, CHEEV failed to converge; +*> i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> > N: if INFO = N + i, for 1 <= i <= N, then the leading +*> minor of order i of B is not positive definite. +*> The factorization of B could not be completed and +*> no eigenvalues or eigenvectors were computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexHEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, + $ WORK, LWORK, RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. + REAL RWORK( * ), W( * ) + COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTZ + CHARACTER TRANS + INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM, + $ CHEEV_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN + INFO = -1 + ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -2 + ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'CHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHEGV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form a Cholesky factorization of B. +* + CALL CPOTRF( UPLO, N, B, LDB, INFO ) + IF( INFO.NE.0 ) THEN + INFO = N + INFO + RETURN + END IF +* +* Transform problem to standard eigenvalue problem and solve. +* + CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) + CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, + $ WORK, LWORK, RWORK, INFO ) +* + IF( WANTZ ) THEN +* +* Backtransform eigenvectors to the original problem. +* + NEIG = N + IF( INFO.GT.0 ) + $ NEIG = INFO - 1 + IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN +* +* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; +* backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y +* + IF( UPPER ) THEN + TRANS = 'N' + ELSE + TRANS = 'C' + END IF +* + CALL CTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) +* + ELSE IF( ITYPE.EQ.3 ) THEN +* +* For B*A*x=(lambda)*x; +* backtransform eigenvectors: x = L*y or U**H *y +* + IF( UPPER ) THEN + TRANS = 'C' + ELSE + TRANS = 'N' + END IF +* + CALL CTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) + END IF + END IF +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of CHEGV_2STAGE +* + END diff --git a/SRC/chetrd_2stage.f b/SRC/chetrd_2stage.f new file mode 100644 index 00000000..795462c6 --- /dev/null +++ b/SRC/chetrd_2stage.f @@ -0,0 +1,337 @@ +*> \brief \b CHETRD_2STAGE +* +* @generated from zhetrd_2stage.f, fortran z -> c, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHETRD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* HOUS2, LHOUS2, WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER VECT, UPLO +* INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. +* REAL D( * ), E( * ) +* COMPLEX A( LDA, * ), TAU( * ), +* HOUS2( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric +*> tridiagonal form T by a unitary similarity transformation: +*> Q1**H Q2**H* A * Q2 * Q1 = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> in particular for the second stage (Band to +*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, +*> then LHOUS2 is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the band superdiagonal +*> of A are overwritten by the corresponding elements of the +*> internal band-diagonal matrix AB, and the elements above +*> the KD superdiagonal, with the array TAU, represent the unitary +*> matrix Q1 as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and band subdiagonal of A are over- +*> written by the corresponding elements of the internal band-diagonal +*> matrix AB, and the elements below the KD subdiagonal, with +*> the array TAU, represent the unitary matrix Q1 as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is REAL array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (N-KD) +*> The scalar factors of the elementary reflectors of +*> the first stage (see Further Details). +*> \endverbatim +*> +*> \param[out] HOUS2 +*> \verbatim +*> HOUS2 is COMPLEX array, dimension LHOUS2, that +*> store the Householder representation of the stage2 +*> band to tridiagonal. +*> \endverbatim +*> +*> \param[in] LHOUS2 +*> \verbatim +*> LHOUS2 is INTEGER +*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS2 array, returns +*> this value as the first entry of the HOUS2 array, and no error +*> message related to LHOUS2 is issued by XERBLA. +*> LHOUS2 = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexHEcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + $ HOUS2, LHOUS2, WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER VECT, UPLO + INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. + REAL D( * ), E( * ) + COMPLEX A( LDA, * ), TAU( * ), + $ HOUS2( * ), WORK( * ) +* .. +* +* ===================================================================== +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTQ + INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, CHETRD_HE2HB, CHETRD_HB2ST +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + KD = ILAENV( 17, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) + IB = ILAENV( 18, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) +* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, +* $ LHMIN, LWMIN +* + IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHETRD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDAB = KD+1 + LWRK = LWORK-LDAB*N + ABPOS = 1 + WPOS = ABPOS + LDAB*N + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + $ TAU, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHETRD_HE2HB', -INFO ) + RETURN + END IF + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, + $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHETRD_HB2ST', -INFO ) + RETURN + END IF +* +* + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of CHETRD_2STAGE +* + END diff --git a/SRC/chetrd_hb2st.F b/SRC/chetrd_hb2st.F new file mode 100644 index 00000000..6f253278 --- /dev/null +++ b/SRC/chetrd_hb2st.F @@ -0,0 +1,603 @@ +*> \brief \b CHBTRD +* +* @generated from zhetrd_hb2st.F, fortran z -> c, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHBTRD + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chbtrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chbtrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chbtrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +* #define PRECISION_COMPLEX +* +* #if defined(_OPENMP) +* use omp_lib +* #endif +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER STAGE1, UPLO, VECT +* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. +* REAL D( * ), E( * ) +* COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHBTRD reduces a complex Hermitian band matrix A to real symmetric +*> tridiagonal form T by a unitary similarity transformation: +*> Q**H * A * Q = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] STAGE +*> \verbatim +*> STAGE is CHARACTER*1 +*> = 'N': "No": to mention that the stage 1 of the reduction +*> from dense to band using the chetrd_he2hb routine +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the chetrd_he2hb +*> routine has been called to produce AB (e.g., AB is +*> the output of chetrd_he2hb. +*> \endverbatim +*> +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> and thus LHOUS is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, +*> then LHOUS is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB,N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> On exit, the diagonal elements of AB are overwritten by the +*> diagonal elements of the tridiagonal matrix T; if KD > 0, the +*> elements on the first superdiagonal (if UPLO = 'U') or the +*> first subdiagonal (if UPLO = 'L') are overwritten by the +*> off-diagonal elements of T; the rest of AB is overwritten by +*> values generated during the reduction. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is REAL array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T: +*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. +*> \endverbatim +*> +*> \param[out] HOUS +*> \verbatim +*> HOUS is COMPLEX array, dimension LHOUS, that +*> store the Householder representation. +*> \endverbatim +*> +*> \param[in] LHOUS +*> \verbatim +*> LHOUS is INTEGER +*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS array, returns +*> this value as the first entry of the HOUS array, and no error +*> message related to LHOUS is issued by XERBLA. +*> LHOUS = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexOTHERcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +#define PRECISION_COMPLEX +* +#if defined(_OPENMP) + use omp_lib +#endif +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER STAGE1, UPLO, VECT + INTEGER N, KD, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. + REAL D( * ), E( * ) + COMPLEX AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL RZERO + COMPLEX ZERO, ONE + PARAMETER ( RZERO = 0.0E+0, + $ ZERO = ( 0.0E+0, 0.0E+0 ), + $ ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, + $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, + $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, + $ SICEV, SIZETAU, LDV, LHMIN, LWMIN +#if defined (PRECISION_COMPLEX) + REAL ABSTMP + COMPLEX TMP +#endif +* .. +* .. External Subroutines .. + EXTERNAL CHB2ST_KERNELS, CLACPY, CLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX, CEILING, REAL +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required. +* Test the input parameters +* + DEBUG = 0 + INFO = 0 + AFTERS1 = LSAME( STAGE1, 'Y' ) + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) +* + IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -2 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.(KD+1) ) THEN + INFO = -7 + ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHETRD_HB2ST', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDV = KD + IB + SIZETAU = 2 * N + SICEV = 2 * N + INDTAU = 1 + INDV = INDTAU + SIZETAU + LDA = 2 * KD + 1 + SIZEA = LDA * N + INDA = 1 + INDW = INDA + SIZEA + NTHREADS = 1 + TID = 0 +* + IF( UPPER ) THEN + APOS = INDA + KD + AWPOS = INDA + DPOS = APOS + KD + OFDPOS = DPOS - 1 + ABDPOS = KD + 1 + ABOFDPOS = KD + ELSE + APOS = INDA + AWPOS = INDA + KD + 1 + DPOS = APOS + OFDPOS = DPOS + 1 + ABDPOS = 1 + ABOFDPOS = 2 + + ENDIF +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or +* in a parallel environment it might need two cores for D and E +* + IF( KD.EQ.0 ) THEN + DO 30 I = 1, N + D( I ) = REAL( AB( ABDPOS, I ) ) + 30 CONTINUE + DO 40 I = 1, N-1 + E( I ) = RZERO + 40 CONTINUE + GOTO 200 + END IF +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal +* and offdiagonal elements real, and store them in D and E. +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler coversion formula might be used, but then +* updating the Q matrix will be required and based if Q is generated +* or not this might complicate the story. +* +C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN + IF( KD.EQ.1 ) THEN + DO 50 I = 1, N + D( I ) = REAL( AB( ABDPOS, I ) ) + 50 CONTINUE +#if defined (PRECISION_COMPLEX) +* +* make off-diagonal elements real and copy them to E +* + IF( UPPER ) THEN + DO 60 I = 1, N - 1 + TMP = AB( ABOFDPOS, I+1 ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I+1 ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP +C IF( WANTZ ) THEN +C CALL CSCAL( N, CONJG( TMP ), Q( 1, I+1 ), 1 ) +C END IF + 60 CONTINUE + ELSE + DO 70 I = 1, N - 1 + TMP = AB( ABOFDPOS, I ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP +C IF( WANTQ ) THEN +C CALL CSCAL( N, TMP, Q( 1, I+1 ), 1 ) +C END IF + 70 CONTINUE + ENDIF +#else + IF( UPPER ) THEN + DO 60 I = 1, N-1 + E( I ) = REAL( AB( ABOFDPOS, I+1 ) ) + 60 CONTINUE + ELSE + DO 70 I = 1, N-1 + E( I ) = REAL( AB( ABOFDPOS, I ) ) + 70 CONTINUE + ENDIF +#endif + GOTO 200 + END IF +* +* Main code start here. +* Reduce the hermitian band of A to a tridiagonal matrix. +* + THGRSIZ = N + GRSIZ = 1 + SHIFT = 3 + NBTILES = CEILING( REAL(N)/REAL(KD) ) + STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) + THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) +* + CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) + CALL CLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) +* +* +* openMP parallelisation start here +* +#if defined(_OPENMP) +!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) +!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) +!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) +!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT ) +!$OMP MASTER +#endif +* +* main bulge chasing loop +* + DO 100 THGRID = 1, THGRNB + STT = (THGRID-1)*THGRSIZ+1 + THED = MIN( (STT + THGRSIZ -1), (N-1)) + DO 110 I = STT, N-1 + ED = MIN( I, THED ) + IF( STT.GT.ED ) GOTO 100 + DO 120 M = 1, STEPERCOL + ST = STT + DO 130 SWEEPID = ST, ED + DO 140 K = 1, GRSIZ + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + $ + (M-1)*GRSIZ + K + IF ( MYID.EQ.1 ) THEN + TTYPE = 1 + ELSE + TTYPE = MOD( MYID, 2 ) + 2 + ENDIF + + IF( TTYPE.EQ.2 ) THEN + COLPT = (MYID/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + BLKLASTIND = COLPT + ELSE + COLPT = ((MYID+1)/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + IF( ( STIND.GE.EDIND-1 ).AND. + $ ( EDIND.EQ.N ) ) THEN + BLKLASTIND = N + ELSE + BLKLASTIND = 0 + ENDIF + ENDIF +* +* Call the kernel +* +#if defined(_OPENMP) + IF( TTYPE.NE.1 ) THEN +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(in:WORK(MYID-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ELSE +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ENDIF +#else + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +#endif + IF ( BLKLASTIND.GE.(N-1) ) THEN + STT = STT + 1 + GOTO 130 + ENDIF + 140 CONTINUE + 130 CONTINUE + 120 CONTINUE + 110 CONTINUE + 100 CONTINUE +* +#if defined(_OPENMP) +!$OMP END MASTER +!$OMP END PARALLEL +#endif +* +* Copy the diagonal from A to D. Note that D is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + DO 150 I = 1, N + D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) ) + 150 CONTINUE +* +* Copy the off diagonal from A to E. Note that E is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + IF( UPPER ) THEN + DO 160 I = 1, N-1 + E( I ) = REAL( WORK( OFDPOS+I*LDA ) ) + 160 CONTINUE + ELSE + DO 170 I = 1, N-1 + E( I ) = REAL( WORK( OFDPOS+(I-1)*LDA ) ) + 170 CONTINUE + ENDIF +* + 200 CONTINUE +* + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of CHETRD_HB2ST +* + END +#undef PRECISION_COMPLEX + diff --git a/SRC/chetrd_he2hb.f b/SRC/chetrd_he2hb.f new file mode 100644 index 00000000..28f5dc60 --- /dev/null +++ b/SRC/chetrd_he2hb.f @@ -0,0 +1,517 @@ +*> \brief \b CHETRD_HE2HB +* +* @generated from zhetrd_he2hb.f, fortran z -> c, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CHETRD_HE2HB + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), AB( LDAB, * ), +* TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian +*> band-diagonal form AB by a unitary similarity transformation: +*> Q**H * A * Q = AB. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the reduced matrix if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> The reduced matrix is stored in the array AB. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the diagonal and first superdiagonal +*> of A are overwritten by the corresponding elements of the +*> tridiagonal matrix T, and the elements above the first +*> superdiagonal, with the array TAU, represent the unitary +*> matrix Q as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and first subdiagonal of A are over- +*> written by the corresponding elements of the tridiagonal +*> matrix T, and the elements below the first subdiagonal, with +*> the array TAU, represent the unitary matrix Q as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB,N) +*> On exit, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (N-KD) +*> The scalar factors of the elementary reflectors (see Further +*> Details). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension LWORK. +*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK(1) returns the size of LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK which should be calculated +* by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD +*> where FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice otherwise +*> putting LWORK=-1 will provide the size of WORK. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complexHEcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +*> \verbatim +*> +*> If UPLO = 'U', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**H +*> +*> where tau is a complex scalar, and v is a complex vector with +*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in +*> A(i,i+kd+1:n), and tau in TAU(i). +*> +*> If UPLO = 'L', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(1) H(2) . . . H(k), where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**H +*> +*> where tau is a complex scalar, and v is a complex vector with +*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in +* A(i+kd+2:n,i), and tau in TAU(i). +*> +*> The contents of A on exit are illustrated by the following examples +*> with n = 5: +*> +*> if UPLO = 'U': if UPLO = 'L': +*> +*> ( ab ab/v1 v1 v1 v1 ) ( ab ) +*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab ) +*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab ) +*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab ) +*> ( ab ) ( v1 v2 v3 ab/v4 ab ) +*> +*> where d and e denote diagonal and off-diagonal elements of T, and vi +*> denotes an element of the vector defining H(i). +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), AB( LDAB, * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL RONE + COMPLEX ZERO, ONE, HALF + PARAMETER ( RONE = 1.0E+0, + $ ZERO = ( 0.0E+0, 0.0E+0 ), + $ ONE = ( 1.0E+0, 0.0E+0 ), + $ HALF = ( 0.5E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER + INTEGER I, J, IINFO, LWMIN, PN, PK, LK, + $ LDT, LDW, LDS2, LDS1, + $ LS2, LS1, LW, LT, + $ TPOS, WPOS, S2POS, S1POS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, CHER2K, CHEMM, CGEMM, + $ CLARFT, CGELQF, CGEQRF, CLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required +* and test the input parameters +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) + LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( KD.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN + INFO = -7 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CHETRD_HE2HB', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN + END IF +* +* Quick return if possible +* Copy the upper/lower portion of A into AB +* + IF( N.LE.KD+1 ) THEN + IF( UPPER ) THEN + DO 100 I = 1, N + LK = MIN( KD+1, I ) + CALL CCOPY( LK, A( I-LK+1, I ), 1, + $ AB( KD+1-LK+1, I ), 1 ) + 100 CONTINUE + ELSE + DO 110 I = 1, N + LK = MIN( KD+1, N-I+1 ) + CALL CCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 ) + 110 CONTINUE + ENDIF + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine the pointer position for the workspace +* + LDT = KD + LDS1 = KD + LT = LDT*KD + LW = N*KD + LS1 = LDS1*KD + LS2 = LWMIN - LT - LW - LS1 +* LS2 = N*MAX(KD,FACTOPTNB) + TPOS = 1 + WPOS = TPOS + LT + S1POS = WPOS + LW + S2POS = S1POS + LS1 + IF( UPPER ) THEN + LDW = KD + LDS2 = KD + ELSE + LDW = N + LDS2 = N + ENDIF +* +* +* Set the workspace of the triangular matrix T to zero once such a +* way everytime T is generated the upper/lower portion will be always zero +* + CALL CLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) +* + IF( UPPER ) THEN + DO 10 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the LQ factorization of the current block +* + CALL CGELQF( KD, PN, A( I, I+KD ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 20 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 20 CONTINUE +* + CALL CLASET( 'Lower', PK, PK, ZERO, ONE, + $ A( I, I+KD ), LDA ) +* +* Form the matrix T +* + CALL CLARFT( 'Forward', 'Rowwise', PN, PK, + $ A( I, I+KD ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL CGEMM( 'Conjugate', 'No transpose', PK, PN, PK, + $ ONE, WORK( TPOS ), LDT, + $ A( I, I+KD ), LDA, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL CHEMM( 'Right', UPLO, PK, PN, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL CGEMM( 'No transpose', 'Conjugate', PK, PK, PN, + $ ONE, WORK( WPOS ), LDW, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL CGEMM( 'No transpose', 'No transpose', PK, PN, PK, + $ -HALF, WORK( S1POS ), LDS1, + $ A( I, I+KD ), LDA, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V'*W - W'*V +* + CALL CHER2K( UPLO, 'Conjugate', PN, PK, + $ -ONE, A( I, I+KD ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) + 10 CONTINUE +* +* Copy the upper band to AB which is the band storage matrix +* + DO 30 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL CCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 30 CONTINUE +* + ELSE +* +* Reduce the lower triangle of A to lower band matrix +* + DO 40 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the QR factorization of the current block +* + CALL CGEQRF( PN, KD, A( I+KD, I ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 50 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 50 CONTINUE +* + CALL CLASET( 'Upper', PK, PK, ZERO, ONE, + $ A( I+KD, I ), LDA ) +* +* Form the matrix T +* + CALL CLARFT( 'Forward', 'Columnwise', PN, PK, + $ A( I+KD, I ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ ONE, A( I+KD, I ), LDA, + $ WORK( TPOS ), LDT, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL CHEMM( 'Left', UPLO, PN, PK, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL CGEMM( 'Conjugate', 'No transpose', PK, PK, PN, + $ ONE, WORK( S2POS ), LDS2, + $ WORK( WPOS ), LDW, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL CGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ -HALF, A( I+KD, I ), LDA, + $ WORK( S1POS ), LDS1, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V*W' - W*V' +* + CALL CHER2K( UPLO, 'No transpose', PN, PK, + $ -ONE, A( I+KD, I ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) +* ================================================================== +* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED +* DO 45 J = I, I+PK-1 +* LK = MIN( KD, N-J ) + 1 +* CALL CCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 ) +* 45 CONTINUE +* ================================================================== + 40 CONTINUE +* +* Copy the lower band to AB which is the band storage matrix +* + DO 60 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL CCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 60 CONTINUE + + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of CHETRD_HE2HB +* + END diff --git a/SRC/clarfy.f b/SRC/clarfy.f new file mode 100644 index 00000000..572a4723 --- /dev/null +++ b/SRC/clarfy.f @@ -0,0 +1,163 @@ +*> \brief \b CLARFY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INCV, LDC, N +* COMPLEX TAU +* .. +* .. Array Arguments .. +* COMPLEX C( LDC, * ), V( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFY applies an elementary reflector, or Householder matrix, H, +*> to an n x n Hermitian matrix C, from both the left and the right. +*> +*> H is represented in the form +*> +*> H = I - tau * v * v' +*> +*> where tau is a scalar and v is a vector. +*> +*> If tau is zero, then H is taken to be the unit matrix. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the upper or lower triangular part of the +*> Hermitian matrix C is stored. +*> = 'U': Upper triangle +*> = 'L': Lower triangle +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of rows and columns of the matrix C. N >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension +*> (1 + (N-1)*abs(INCV)) +*> The vector v as described above. +*> \endverbatim +*> +*> \param[in] INCV +*> \verbatim +*> INCV is INTEGER +*> The increment between successive elements of v. INCV must +*> not be zero. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is COMPLEX +*> The value tau as described above. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC, N) +*> On entry, the matrix C. +*> On exit, C is overwritten by H * C * H'. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max( 1, N ). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (N) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2011 +* +*> \ingroup complex_eig +* +* ===================================================================== + SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* -- LAPACK test routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2011 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCV, LDC, N + COMPLEX TAU +* .. +* .. Array Arguments .. + COMPLEX C( LDC, * ), V( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, ZERO, HALF + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ), + $ HALF = ( 0.5E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + COMPLEX ALPHA +* .. +* .. External Subroutines .. + EXTERNAL CAXPY, CHEMV, CHER2 +* .. +* .. External Functions .. + COMPLEX CDOTC + EXTERNAL CDOTC +* .. +* .. Executable Statements .. +* + IF( TAU.EQ.ZERO ) + $ RETURN +* +* Form w:= C * v +* + CALL CHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 ) +* + ALPHA = -HALF*TAU*CDOTC( N, WORK, 1, V, INCV ) + CALL CAXPY( N, ALPHA, V, INCV, WORK, 1 ) +* +* C := C - v * w' - w * v' +* + CALL CHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC ) +* + RETURN +* +* End of CLARFY +* + END diff --git a/SRC/dlarfy.f b/SRC/dlarfy.f new file mode 100644 index 00000000..089aa94e --- /dev/null +++ b/SRC/dlarfy.f @@ -0,0 +1,161 @@ +*> \brief \b DLARFY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INCV, LDC, N +* DOUBLE PRECISION TAU +* .. +* .. Array Arguments .. +* DOUBLE PRECISION C( LDC, * ), V( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFY applies an elementary reflector, or Householder matrix, H, +*> to an n x n symmetric matrix C, from both the left and the right. +*> +*> H is represented in the form +*> +*> H = I - tau * v * v' +*> +*> where tau is a scalar and v is a vector. +*> +*> If tau is zero, then H is taken to be the unit matrix. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the upper or lower triangular part of the +*> symmetric matrix C is stored. +*> = 'U': Upper triangle +*> = 'L': Lower triangle +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of rows and columns of the matrix C. N >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> (1 + (N-1)*abs(INCV)) +*> The vector v as described above. +*> \endverbatim +*> +*> \param[in] INCV +*> \verbatim +*> INCV is INTEGER +*> The increment between successive elements of v. INCV must +*> not be zero. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION +*> The value tau as described above. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC, N) +*> On entry, the matrix C. +*> On exit, C is overwritten by H * C * H'. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max( 1, N ). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (N) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2011 +* +*> \ingroup double_eig +* +* ===================================================================== + SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* -- LAPACK test routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2011 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCV, LDC, N + DOUBLE PRECISION TAU +* .. +* .. Array Arguments .. + DOUBLE PRECISION C( LDC, * ), V( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO, HALF + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, HALF = 0.5D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION ALPHA +* .. +* .. External Subroutines .. + EXTERNAL DAXPY, DSYMV, DSYR2 +* .. +* .. External Functions .. + DOUBLE PRECISION DDOT + EXTERNAL DDOT +* .. +* .. Executable Statements .. +* + IF( TAU.EQ.ZERO ) + $ RETURN +* +* Form w:= C * v +* + CALL DSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 ) +* + ALPHA = -HALF*TAU*DDOT( N, WORK, 1, V, INCV ) + CALL DAXPY( N, ALPHA, V, INCV, WORK, 1 ) +* +* C := C - v * w' - w * v' +* + CALL DSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC ) +* + RETURN +* +* End of DLARFY +* + END diff --git a/SRC/dsb2st_kernels.f b/SRC/dsb2st_kernels.f new file mode 100644 index 00000000..15d1186e --- /dev/null +++ b/SRC/dsb2st_kernels.f @@ -0,0 +1,320 @@ +*> \brief \b DSB2ST_KERNELS +* +* @generated from zhb2st_kernels.f, fortran z -> d, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSB2ST_KERNELS + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsb2st_kernels.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsb2st_kernels.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsb2st_kernels.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* ST, ED, SWEEP, N, NB, IB, +* A, LDA, V, TAU, LDVT, WORK) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* LOGICAL WANTZ +* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), V( * ), +* TAU( * ), WORK( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSB2ST_KERNELS is an internal routine used by the DSYTRD_SB2ST +*> subroutine. +*> \endverbatim +* +* Arguments: +* ========== +* +*> @param[in] n +*> The order of the matrix A. +*> +*> @param[in] nb +*> The size of the band. +*> +*> @param[in, out] A +*> A pointer to the matrix A. +*> +*> @param[in] lda +*> The leading dimension of the matrix A. +*> +*> @param[out] V +*> DOUBLE PRECISION array, dimension 2*n if eigenvalues only are +*> requested or to be queried for vectors. +*> +*> @param[out] TAU +*> DOUBLE PRECISION array, dimension (2*n). +*> The scalar factors of the Householder reflectors are stored +*> in this array. +*> +*> @param[in] st +*> internal parameter for indices. +*> +*> @param[in] ed +*> internal parameter for indices. +*> +*> @param[in] sweep +*> internal parameter for indices. +*> +*> @param[in] Vblksiz +*> internal parameter for indices. +*> +*> @param[in] wantz +*> logical which indicate if Eigenvalue are requested or both +*> Eigenvalue/Eigenvectors. +*> +*> @param[in] work +*> Workspace of size nb. +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, + $ ST, ED, SWEEP, N, NB, IB, + $ A, LDA, V, TAU, LDVT, WORK) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + LOGICAL WANTZ + INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), V( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, + $ ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, + $ DPOS, OFDPOS, AJETER + DOUBLE PRECISION CTMP +* .. +* .. External Subroutines .. + EXTERNAL DLARFG, DLARFX, DLARFY +* .. +* .. Intrinsic Functions .. + INTRINSIC MOD +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. +* .. Executable Statements .. +* + AJETER = IB + LDVT + UPPER = LSAME( UPLO, 'U' ) + + IF( UPPER ) THEN + DPOS = 2 * NB + 1 + OFDPOS = 2 * NB + ELSE + DPOS = 1 + OFDPOS = 2 + ENDIF + +* +* Upper case +* + IF( UPPER ) THEN +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 101, 102, 103 ) TTYPE +* + 101 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 10 I = 1, LM-1 + V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) ) + A( OFDPOS-I, ST+I ) = ZERO + 10 CONTINUE + CTMP = ( A( OFDPOS, ST ) ) + CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) + A( OFDPOS, ST ) = CTMP +* + 103 CONTINUE + LM = ED - ST + 1 + CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + GOTO 300 +* + 102 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 + IF( LM.GT.0) THEN + CALL DLARFX( 'Left', LN, LM, V( VPOS ), + $ ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 30 I = 1, LM-1 + V( VPOS+I ) = ( A( DPOS-NB-I, J1+I ) ) + A( DPOS-NB-I, J1+I ) = ZERO + 30 CONTINUE + CTMP = ( A( DPOS-NB, J1 ) ) + CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) + A( DPOS-NB, J1 ) = CTMP +* + CALL DLARFX( 'Right', LN-1, LM, V( VPOS ), + $ TAU( TAUPOS ), + $ A( DPOS-NB+1, J1 ), LDA-1, WORK) + ENDIF + GOTO 300 +* +* Lower case +* + ELSE +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 201, 202, 203 ) TTYPE +* + 201 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 20 I = 1, LM-1 + V( VPOS+I ) = A( OFDPOS+I, ST-1 ) + A( OFDPOS+I, ST-1 ) = ZERO + 20 CONTINUE + CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + 203 CONTINUE + LM = ED - ST + 1 +* + CALL DLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + + GOTO 300 +* + 202 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 +* + IF( LM.GT.0) THEN + CALL DLARFX( 'Right', LM, LN, V( VPOS ), + $ TAU( TAUPOS ), A( DPOS+NB, ST ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 40 I = 1, LM-1 + V( VPOS+I ) = A( DPOS+NB+I, ST ) + A( DPOS+NB+I, ST ) = ZERO + 40 CONTINUE + CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + CALL DLARFX( 'Left', LM, LN-1, V( VPOS ), + $ ( TAU( TAUPOS ) ), + $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) + + ENDIF + GOTO 300 + ENDIF + + 300 CONTINUE + RETURN +* +* END OF DSB2ST_KERNELS +* + END diff --git a/SRC/dsbev_2stage.f b/SRC/dsbev_2stage.f new file mode 100644 index 00000000..771d29e0 --- /dev/null +++ b/SRC/dsbev_2stage.f @@ -0,0 +1,377 @@ +*> \brief <b> DSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSBEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. +* DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. + DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, WANTZ, LQUERY + INTEGER IINFO, IMAX, INDE, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSB + EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DLASCL, DSCAL, DSTEQR, DSTERF, XERBLA + $ DSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -11 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSBEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + IF( LOWER ) THEN + W( 1 ) = AB( 1, 1 ) + ELSE + W( 1 ) = AB( KD+1, 1 ) + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, call SSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, WORK( INDE ), INFO ) + ELSE + CALL DSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ), + $ INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of DSBEV_2STAGE +* + END diff --git a/SRC/dsbevd_2stage.f b/SRC/dsbevd_2stage.f new file mode 100644 index 00000000..39074681 --- /dev/null +++ b/SRC/dsbevd_2stage.f @@ -0,0 +1,412 @@ +*> \brief <b> DSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSBEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses +*> a divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK and IWORK +*> arrays, returns these values as the first entries of the WORK +*> and IWORK arrays, and no error message related to LWORK or +*> LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 2, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK and IWORK arrays, and no error message related to +*> LWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ LLWRK2 + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSB + EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLACPY, DLASCL, DSCAL, DSTEDC, + $ DSTERF, XERBLA, DSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( N.LE.1 ) THEN + LIWMIN = 1 + LWMIN = 1 + ELSE + IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LIWMIN = 3 + 5*N + LWMIN = 1 + 5*N + 2*N**2 + ELSE + LIWMIN = 1 + LWMIN = MAX( 2*N, N+LHTRD+LWTRD ) + END IF + END IF + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSBEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = AB( 1, 1 ) + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call DSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, call SSTEDC. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, WORK( INDE ), INFO ) + ELSE + CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO ) + CALL DGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N, + $ ZERO, WORK( INDWK2 ), N ) + CALL DLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) + $ CALL DSCAL( N, ONE / SIGMA, W, 1 ) +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN + RETURN +* +* End of DSBEVD_2STAGE +* + END diff --git a/SRC/dsbevx_2stage.f b/SRC/dsbevx_2stage.f new file mode 100644 index 00000000..3cb3f661 --- /dev/null +++ b/SRC/dsbevx_2stage.f @@ -0,0 +1,633 @@ +*> \brief <b> DSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSBEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q, +* LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z, +* LDZ, WORK, LWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* DOUBLE PRECISION AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found; +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found; +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] Q +*> \verbatim +*> Q is DOUBLE PRECISION array, dimension (LDQ, N) +*> If JOBZ = 'V', the N-by-N orthogonal matrix used in the +*> reduction to tridiagonal form. +*> If JOBZ = 'N', the array Q is not referenced. +*> \endverbatim +*> +*> \param[in] LDQ +*> \verbatim +*> LDQ is INTEGER +*> The leading dimension of the array Q. If JOBZ = 'V', then +*> LDQ >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing AB to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*DLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*DLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 7*N, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + 2*N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup doubleOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q, + $ LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z, + $ LDZ, WORK, LWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + DOUBLE PRECISION AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ), + $ Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ, + $ LQUERY + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ NSPLIT + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSB + EXTERNAL LSAME, DLAMCH, DLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGEMV, DLACPY, DLASCL, DSCAL, + $ DSTEBZ, DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA, + $ DSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -7 + ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -11 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -12 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -13 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) + $ INFO = -18 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'DSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -20 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSBEVX_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + M = 1 + IF( LOWER ) THEN + TMP1 = AB( 1, 1 ) + ELSE + TMP1 = AB( KD+1, 1 ) + END IF + IF( VALEIG ) THEN + IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) ) + $ M = 0 + END IF + IF( M.EQ.1 ) THEN + W( 1 ) = TMP1 + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + ELSE + VLL = ZERO + VUU = ZERO + END IF + ANRM = DLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL DLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL DLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call DSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL DSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ), + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal +* to zero, then call DSTERF or SSTEQR. If this fails for some +* eigenvalue, then try DSTEBZ. +* + TEST = .FALSE. + IF (INDEIG) THEN + IF (IL.EQ.1 .AND. IU.EQ.N) THEN + TEST = .TRUE. + END IF + END IF + IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN + CALL DCOPY( N, WORK( INDD ), 1, W, 1 ) + INDEE = INDWRK + 2*N + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL DLACPY( 'A', N, N, Q, LDQ, Z, LDZ ) + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ, + $ WORK( INDWRK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 10 I = 1, N + IFAIL( I ) = 0 + 10 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWO = INDISP + N + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by DSTEIN. +* + DO 20 J = 1, M + CALL DCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 ) + CALL DGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO, + $ Z( 1, J ), 1 ) + 20 CONTINUE + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of DSBEVX_2STAGE +* + END diff --git a/SRC/dsyev_2stage.f b/SRC/dsyev_2stage.f new file mode 100644 index 00000000..a42e86d8 --- /dev/null +++ b/SRC/dsyev_2stage.f @@ -0,0 +1,348 @@ +*> \brief <b> DSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSY + EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLASCL, DORGTR, DSCAL, DSTEQR, DSTERF, + $ XERBLA, DSYTRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -8 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + W( 1 ) = A( 1, 1 ) + WORK( 1 ) = 2 + IF( WANTZ ) + $ A( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDE = 1 + INDTAU = INDE + N + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, first call +* DORGTR to generate the orthogonal matrix, then call DSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, WORK( INDE ), INFO ) + ELSE +* Not available in this release, and agrument checking should not +* let it getting here + RETURN + CALL DORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), + $ LLWORK, IINFO ) + CALL DSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ), + $ INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of DSYEV_2STAGE +* + END diff --git a/SRC/dsyevd_2stage.f b/SRC/dsyevd_2stage.f new file mode 100644 index 00000000..161f0e9e --- /dev/null +++ b/SRC/dsyevd_2stage.f @@ -0,0 +1,406 @@ +*> \brief <b> DSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LIWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses a +*> divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, +*> dimension (LWORK) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1 +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N+1 +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be at least +*> 1 + 6*N + 2*N**2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK and IWORK +*> arrays, returns these values as the first entries of the WORK +*> and IWORK arrays, and no error message related to LWORK or +*> LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK and IWORK arrays, and no error message related to +*> LWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed +*> to converge; i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> if INFO = i and JOBZ = 'V', then the algorithm failed +*> to compute an eigenvalue while working on the submatrix +*> lying in rows and columns INFO/(N+1) through +*> mod(INFO,N+1). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date September 2012 +* +*> \ingroup doubleSYeigen +* +*> \par Contributors: +* ================== +*> +*> Jeff Rutter, Computer Science Division, University of California +*> at Berkeley, USA \n +*> Modified by Francoise Tisseur, University of Tennessee \n +*> Modified description of INFO. Sven, 16 Feb 05. \n +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.2) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* September 2012 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LIWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. +* + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE, + $ LIWMIN, LLWORK, LLWRK2, LWMIN, + $ LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSY + EXTERNAL LSAME, DLAMCH, DLANSY, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLASCL, DORMTR, DSCAL, DSTEDC, DSTERF, + $ DSYTRD_2STAGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LIWMIN = 1 + LWMIN = 1 + ELSE + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LIWMIN = 3 + 5*N + LWMIN = 1 + 6*N + 2*N**2 + ELSE + LIWMIN = 1 + LWMIN = 2*N + 1 + LHTRD + LWTRD + END IF + END IF + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -8 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = A( 1, 1 ) + IF( WANTZ ) + $ A( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL DLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDE = 1 + INDTAU = INDE + N + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, first call +* DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the +* tridiagonal matrix, then call DORMTR to multiply it by the +* Householder transformations stored in A. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, WORK( INDE ), INFO ) + ELSE +* Not available in this release, and agrument checking should not +* let it getting here + RETURN + CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO ) + CALL DORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ), + $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO ) + CALL DLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) + $ CALL DSCAL( N, ONE / SIGMA, W, 1 ) +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of DSYEVD_2STAGE +* + END diff --git a/SRC/dsyevr_2stage.f b/SRC/dsyevr_2stage.f new file mode 100644 index 00000000..c1b468dc --- /dev/null +++ b/SRC/dsyevr_2stage.f @@ -0,0 +1,740 @@ +*> \brief <b> DSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYEVR_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevr_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevr_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevr_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, +* LWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER ISUPPZ( * ), IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be +*> selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> +*> DSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call +*> to DSYTRD. Then, whenever possible, DSYEVR_2STAGE calls DSTEMR to compute +*> the eigenspectrum using Relatively Robust Representations. DSTEMR +*> computes eigenvalues by the dqds algorithm, while orthogonal +*> eigenvectors are computed from various "good" L D L^T representations +*> (also known as Relatively Robust Representations). Gram-Schmidt +*> orthogonalization is avoided as far as possible. More specifically, +*> the various steps of the algorithm are as follows. +*> +*> For each unreduced block (submatrix) of T, +*> (a) Compute T - sigma I = L D L^T, so that L and D +*> define all the wanted eigenvalues to high relative accuracy. +*> This means that small relative changes in the entries of D and L +*> cause only small relative changes in the eigenvalues and +*> eigenvectors. The standard (unfactored) representation of the +*> tridiagonal matrix T does not have this property in general. +*> (b) Compute the eigenvalues to suitable accuracy. +*> If the eigenvectors are desired, the algorithm attains full +*> accuracy of the computed eigenvalues only right before +*> the corresponding vectors have to be computed, see steps c) and d). +*> (c) For each cluster of close eigenvalues, select a new +*> shift close to the cluster, find a new factorization, and refine +*> the shifted eigenvalues to suitable accuracy. +*> (d) For each eigenvalue with a large enough relative separation compute +*> the corresponding eigenvector by forming a rank revealing twisted +*> factorization. Go back to (c) for any clusters that remain. +*> +*> The desired accuracy of the output can be specified by the input +*> parameter ABSTOL. +*> +*> For more details, see DSTEMR's documentation and: +*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations +*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices," +*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004. +*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and +*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25, +*> 2004. Also LAPACK Working Note 154. +*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric +*> tridiagonal eigenvalue/eigenvector problem", +*> Computer Science Division Technical Report No. UCB/CSD-97-971, +*> UC Berkeley, May 1997. +*> +*> +*> Note 1 : DSYEVR_2STAGE calls DSTEMR when the full spectrum is requested +*> on machines which conform to the ieee-754 floating point standard. +*> DSYEVR_2STAGE calls DSTEBZ and SSTEIN on non-ieee machines and +*> when partial spectrum requests are made. +*> +*> Normal execution of DSTEMR may create NaNs and infinities and +*> hence may abort due to a floating point exception in environments +*> which do not handle NaNs and infinities in the ieee standard default +*> manner. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and +*> DSTEIN are called +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> +*> If high relative accuracy is important, set ABSTOL to +*> DLAMCH( 'Safe minimum' ). Doing so will guarantee that +*> eigenvalues are computed to high relative accuracy when +*> possible in future releases. The current code does not +*> make any guarantees about high relative accuracy, but +*> future releases will. See J. Barlow and J. Demmel, +*> "Computing Accurate Eigensystems of Scaled Diagonally +*> Dominant Matrices", LAPACK Working Note #7, for a discussion +*> of which matrices define their eigenvalues to high relative +*> accuracy. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> Supplying N columns is always safe. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] ISUPPZ +*> \verbatim +*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) ) +*> The support of the eigenvectors in Z, i.e., the indices +*> indicating the nonzero elements in Z. The i-th eigenvector +*> is nonzero only in elements ISUPPZ( 2*i-1 ) through +*> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal +*> matrix). The support of the eigenvectors of A is typically +*> 1:N because of the orthogonal transformations applied by DORMTR. +*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 26*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 5*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 5*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal size of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: Internal error +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup doubleSYeigen +* +*> \par Contributors: +* ================== +*> +*> Inderjit Dhillon, IBM Almaden, USA \n +*> Osni Marques, LBNL/NERSC, USA \n +*> Ken Stanley, Computer Science Division, University of +*> California at Berkeley, USA \n +*> Jason Riedy, Computer Science Division, University of +*> California at Berkeley, USA \n +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, + $ LWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER ISUPPZ( * ), IWORK( * ) + DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ, + $ TRYRAC + CHARACTER ORDER + INTEGER I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE, + $ INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU, + $ INDWK, INDWKN, ISCALE, J, JJ, LIWMIN, + $ LLWORK, LLWRKN, LWMIN, NSPLIT, + $ LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSY + EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DORMTR, DSCAL, DSTEBZ, DSTEMR, DSTEIN, + $ DSTERF, DSWAP, DSYTRD_2STAGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IEEEOK = ILAENV( 10, 'DSYEVR', 'N', 1, 2, 3, 4 ) +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) +* + LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) +* + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = MAX( 1, 10*N ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -18 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -20 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN +* NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) +* NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) +* LWKOPT = MAX( ( NB+1 )*N, LWMIN ) + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYEVR_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( N.EQ.1 ) THEN + WORK( 1 ) = 7 + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + ELSE + IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + END IF + END IF + IF( WANTZ ) THEN + Z( 1, 1 ) = ONE + ISUPPZ( 1 ) = 1 + ISUPPZ( 2 ) = 1 + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF (VALEIG) THEN + VLL = VL + VUU = VU + END IF + ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL DSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF + +* Initialize indices into workspaces. Note: The IWORK indices are +* used only if DSTERF or DSTEMR fail. + +* WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the +* elementary reflectors used in DSYTRD. + INDTAU = 1 +* WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries. + INDD = INDTAU + N +* WORK(INDE:INDE+N-1) stores the off-diagonal entries of the +* tridiagonal matrix from DSYTRD. + INDE = INDD + N +* WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over +* -written by DSTEMR (the DSTERF path copies the diagonal to W). + INDDD = INDE + N +* WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over +* -written while computing the eigenvalues in DSTERF and DSTEMR. + INDEE = INDDD + N +* INDHOUS is the starting offset Householder storage of stage 2 + INDHOUS = INDEE + N +* INDWK is the starting offset of the left-over workspace, and +* LLWORK is the remaining workspace size. + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + + +* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and +* stores the block indices of each of the M<=N eigenvalues. + INDIBL = 1 +* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and +* stores the starting and finishing indices of each block. + INDISP = INDIBL + N +* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors +* that corresponding to eigenvectors that fail to converge in +* DSTEIN. This information is discarded; if any fail, the driver +* returns INFO > 0. + INDIFL = INDISP + N +* INDIWO is the offset of the remaining integer workspace. + INDIWO = INDIFL + N + +* +* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* +* + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired +* then call DSTERF or DSTEMR and DORMTR. +* + IF( ( ALLEIG .OR. ( INDEIG .AND. IL.EQ.1 .AND. IU.EQ.N ) ) .AND. + $ IEEEOK.EQ.1 ) THEN + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N, WORK( INDD ), 1, W, 1 ) + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 ) +* + IF (ABSTOL .LE. TWO*N*EPS) THEN + TRYRAC = .TRUE. + ELSE + TRYRAC = .FALSE. + END IF + CALL DSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ), + $ VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ, + $ TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK, + $ INFO ) +* +* +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by DSTEMR. +* + IF( WANTZ .AND. INFO.EQ.0 ) THEN + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, + $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ), + $ LLWRKN, IINFO ) + END IF + END IF +* +* + IF( INFO.EQ.0 ) THEN +* Everything worked. Skip DSTEBZ/DSTEIN. IWORK(:) are +* undefined. + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, DSTEIN. +* Also call DSTEBZ and DSTEIN if DSTEMR fails. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ), + $ INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by DSTEIN. +* + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* +* Jump here if DSTEMR/DSTEIN succeeded. + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. Note: We do not sort the IFAIL portion of IWORK. +* It may not be initialized (if DSTEMR/DSTEIN succeeded), and we do +* not return this detailed information to the user. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + W( I ) = W( J ) + W( J ) = TMP1 + CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of DSYEVR_2STAGE +* + END diff --git a/SRC/dsyevx_2stage.f b/SRC/dsyevx_2stage.f new file mode 100644 index 00000000..2c52e9e3 --- /dev/null +++ b/SRC/dsyevx_2stage.f @@ -0,0 +1,608 @@ +*> \brief <b> DSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsyevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsyevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsyevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, WORK, +* LWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be +*> selected by specifying either a range of values or a range of indices +*> for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*DLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*DLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> On normal exit, the first M elements contain the selected +*> eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 8*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 3*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 3*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup doubleSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK, + $ LWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + DOUBLE PRECISION A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE, + $ ITMP1, J, JJ, LLWORK, LLWRKN, + $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANSY + EXTERNAL LSAME, ILAENV, DLAMCH, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLACPY, DORGTR, DORMTR, DSCAL, DSTEBZ, + $ DSTEIN, DSTEQR, DSTERF, DSWAP, XERBLA, + $ DSYTRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD ) + WORK( 1 ) = LWMIN + END IF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -17 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + ELSE + IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + END IF + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + END IF + ANRM = DLANSY( 'M', UPLO, N, A, LDA, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL DSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL DSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDTAU = 1 + INDE = INDTAU + N + INDD = INDE + N + INDHOUS = INDD + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal to +* zero, then call DSTERF or DORGTR and SSTEQR. If this fails for +* some eigenvalue, then try DSTEBZ. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN + CALL DCOPY( N, WORK( INDD ), 1, W, 1 ) + INDEE = INDWRK + 2*N + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL DLACPY( 'A', N, N, A, LDA, Z, LDZ ) + CALL DORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ), + $ WORK( INDWRK ), LLWORK, IINFO ) + CALL DCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL DSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ, + $ WORK( INDWRK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 30 I = 1, N + IFAIL( I ) = 0 + 30 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 40 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, SSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWO = INDISP + N + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by DSTEIN. +* + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL DORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 40 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 60 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 50 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 50 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 60 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of DSYEVX_2STAGE +* + END diff --git a/SRC/dsygv_2stage.f b/SRC/dsygv_2stage.f new file mode 100644 index 00000000..2c79ec8a --- /dev/null +++ b/SRC/dsygv_2stage.f @@ -0,0 +1,370 @@ +*> \brief \b DSYGV_2STAGE +* +* @precisions fortran d -> s +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYGV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsygv_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsygv_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsygv_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors +*> of a real generalized symmetric-definite eigenproblem, of the form +*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. +*> Here A and B are assumed to be symmetric and B is also +*> positive definite. +*> This routine use the 2stage technique for the reduction to tridiagonal +*> which showed higher performance on recent architecture and for large +* sizes N>2000. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] ITYPE +*> \verbatim +*> ITYPE is INTEGER +*> Specifies the problem type to be solved: +*> = 1: A*x = (lambda)*B*x +*> = 2: A*B*x = (lambda)*x +*> = 3: B*A*x = (lambda)*x +*> \endverbatim +*> +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangles of A and B are stored; +*> = 'L': Lower triangles of A and B are stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrices A and B. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> matrix Z of eigenvectors. The eigenvectors are normalized +*> as follows: +*> if ITYPE = 1 or 2, Z**T*B*Z = I; +*> if ITYPE = 3, Z**T*inv(B)*Z = I. +*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') +*> or the lower triangle (if UPLO='L') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB, N) +*> On entry, the symmetric positive definite matrix B. +*> If UPLO = 'U', the leading N-by-N upper triangular part of B +*> contains the upper triangular part of the matrix B. +*> If UPLO = 'L', the leading N-by-N lower triangular part of B +*> contains the lower triangular part of the matrix B. +*> +*> On exit, if INFO <= N, the part of B containing the matrix is +*> overwritten by the triangular factor U or L from the Cholesky +*> factorization B = U**T*U or B = L*L**T. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: DPOTRF or DSYEV returned an error code: +*> <= N: if INFO = i, DSYEV failed to converge; +*> i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> > N: if INFO = N + i, for 1 <= i <= N, then the leading +*> minor of order i of B is not positive definite. +*> The factorization of B could not be completed and +*> no eigenvalues or eigenvectors were computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTZ + CHARACTER TRANS + INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DPOTRF, DSYGST, DTRMM, DTRSM, XERBLA, + $ DSYEV_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN + INFO = -1 + ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -2 + ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYGV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form a Cholesky factorization of B. +* + CALL DPOTRF( UPLO, N, B, LDB, INFO ) + IF( INFO.NE.0 ) THEN + INFO = N + INFO + RETURN + END IF +* +* Transform problem to standard eigenvalue problem and solve. +* + CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) + CALL DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO ) +* + IF( WANTZ ) THEN +* +* Backtransform eigenvectors to the original problem. +* + NEIG = N + IF( INFO.GT.0 ) + $ NEIG = INFO - 1 + IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN +* +* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; +* backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y +* + IF( UPPER ) THEN + TRANS = 'N' + ELSE + TRANS = 'T' + END IF +* + CALL DTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) +* + ELSE IF( ITYPE.EQ.3 ) THEN +* +* For B*A*x=(lambda)*x; +* backtransform eigenvectors: x = L*y or U**T*y +* + IF( UPPER ) THEN + TRANS = 'T' + ELSE + TRANS = 'N' + END IF +* + CALL DTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) + END IF + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of DSYGV_2STAGE +* + END diff --git a/SRC/dsytrd_2stage.f b/SRC/dsytrd_2stage.f new file mode 100644 index 00000000..449a279e --- /dev/null +++ b/SRC/dsytrd_2stage.f @@ -0,0 +1,337 @@ +*> \brief \b DSYTRD_2STAGE +* +* @generated from zhetrd_2stage.f, fortran z -> d, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYTRD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* HOUS2, LHOUS2, WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER VECT, UPLO +* INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. +* DOUBLE PRECISION D( * ), E( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), +* HOUS2( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric +*> tridiagonal form T by a orthogonal similarity transformation: +*> Q1**T Q2**T* A * Q2 * Q1 = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> in particular for the second stage (Band to +*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, +*> then LHOUS2 is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the band superdiagonal +*> of A are overwritten by the corresponding elements of the +*> internal band-diagonal matrix AB, and the elements above +*> the KD superdiagonal, with the array TAU, represent the orthogonal +*> matrix Q1 as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and band subdiagonal of A are over- +*> written by the corresponding elements of the internal band-diagonal +*> matrix AB, and the elements below the KD subdiagonal, with +*> the array TAU, represent the orthogonal matrix Q1 as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is DOUBLE PRECISION array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (N-KD) +*> The scalar factors of the elementary reflectors of +*> the first stage (see Further Details). +*> \endverbatim +*> +*> \param[out] HOUS2 +*> \verbatim +*> HOUS2 is DOUBLE PRECISION array, dimension LHOUS2, that +*> store the Householder representation of the stage2 +*> band to tridiagonal. +*> \endverbatim +*> +*> \param[in] LHOUS2 +*> \verbatim +*> LHOUS2 is INTEGER +*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS2 array, returns +*> this value as the first entry of the HOUS2 array, and no error +*> message related to LHOUS2 is issued by XERBLA. +*> LHOUS2 = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleSYcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + $ HOUS2, LHOUS2, WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER VECT, UPLO + INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. + DOUBLE PRECISION D( * ), E( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), + $ HOUS2( * ), WORK( * ) +* .. +* +* ===================================================================== +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTQ + INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, DSYTRD_SY2SB, DSYTRD_SB2ST +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + KD = ILAENV( 17, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) +* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, +* $ LHMIN, LWMIN +* + IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYTRD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDAB = KD+1 + LWRK = LWORK-LDAB*N + ABPOS = 1 + WPOS = ABPOS + LDAB*N + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + $ TAU, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) + RETURN + END IF + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, + $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) + RETURN + END IF +* +* + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of DSYTRD_2STAGE +* + END diff --git a/SRC/dsytrd_sb2st.F b/SRC/dsytrd_sb2st.F new file mode 100644 index 00000000..d50debe1 --- /dev/null +++ b/SRC/dsytrd_sb2st.F @@ -0,0 +1,603 @@ +*> \brief \b DSBTRD +* +* @generated from zhetrd_hb2st.F, fortran z -> d, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSBTRD + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsbtrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsbtrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsbtrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +* #define PRECISION_REAL +* +* #if defined(_OPENMP) +* use omp_lib +* #endif +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER STAGE1, UPLO, VECT +* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. +* DOUBLE PRECISION D( * ), E( * ) +* DOUBLE PRECISION AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSBTRD reduces a real symmetric band matrix A to real symmetric +*> tridiagonal form T by a orthogonal similarity transformation: +*> Q**T * A * Q = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] STAGE +*> \verbatim +*> STAGE is CHARACTER*1 +*> = 'N': "No": to mention that the stage 1 of the reduction +*> from dense to band using the dsytrd_sy2sb routine +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb +*> routine has been called to produce AB (e.g., AB is +*> the output of dsytrd_sy2sb. +*> \endverbatim +*> +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> and thus LHOUS is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, +*> then LHOUS is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB,N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> On exit, the diagonal elements of AB are overwritten by the +*> diagonal elements of the tridiagonal matrix T; if KD > 0, the +*> elements on the first superdiagonal (if UPLO = 'U') or the +*> first subdiagonal (if UPLO = 'L') are overwritten by the +*> off-diagonal elements of T; the rest of AB is overwritten by +*> values generated during the reduction. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is DOUBLE PRECISION array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T: +*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. +*> \endverbatim +*> +*> \param[out] HOUS +*> \verbatim +*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that +*> store the Householder representation. +*> \endverbatim +*> +*> \param[in] LHOUS +*> \verbatim +*> LHOUS is INTEGER +*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS array, returns +*> this value as the first entry of the HOUS array, and no error +*> message related to LHOUS is issued by XERBLA. +*> LHOUS = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup real16OTHERcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +#define PRECISION_REAL +* +#if defined(_OPENMP) + use omp_lib +#endif +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER STAGE1, UPLO, VECT + INTEGER N, KD, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. + DOUBLE PRECISION D( * ), E( * ) + DOUBLE PRECISION AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION RZERO + DOUBLE PRECISION ZERO, ONE + PARAMETER ( RZERO = 0.0D+0, + $ ZERO = 0.0D+0, + $ ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, + $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, + $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, + $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN +#if defined (PRECISION_COMPLEX) + DOUBLE PRECISION ABSTMP + DOUBLE PRECISION TMP +#endif +* .. +* .. External Subroutines .. + EXTERNAL DSB2ST_KERNELS, DLACPY, DLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX, CEILING, REAL +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required. +* Test the input parameters +* + DEBUG = 0 + INFO = 0 + AFTERS1 = LSAME( STAGE1, 'Y' ) + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) +* + IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -2 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.(KD+1) ) THEN + INFO = -7 + ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDV = KD + IB + SIZETAU = 2 * N + SIDEV = 2 * N + INDTAU = 1 + INDV = INDTAU + SIZETAU + LDA = 2 * KD + 1 + SIZEA = LDA * N + INDA = 1 + INDW = INDA + SIZEA + NTHREADS = 1 + TID = 0 +* + IF( UPPER ) THEN + APOS = INDA + KD + AWPOS = INDA + DPOS = APOS + KD + OFDPOS = DPOS - 1 + ABDPOS = KD + 1 + ABOFDPOS = KD + ELSE + APOS = INDA + AWPOS = INDA + KD + 1 + DPOS = APOS + OFDPOS = DPOS + 1 + ABDPOS = 1 + ABOFDPOS = 2 + + ENDIF +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or +* in a parallel environment it might need two cores for D and E +* + IF( KD.EQ.0 ) THEN + DO 30 I = 1, N + D( I ) = ( AB( ABDPOS, I ) ) + 30 CONTINUE + DO 40 I = 1, N-1 + E( I ) = RZERO + 40 CONTINUE + GOTO 200 + END IF +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal +* and offdiagonal elements real, and store them in D and E. +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler coversion formula might be used, but then +* updating the Q matrix will be required and based if Q is generated +* or not this might complicate the story. +* +C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN + IF( KD.EQ.1 ) THEN + DO 50 I = 1, N + D( I ) = ( AB( ABDPOS, I ) ) + 50 CONTINUE +#if defined (PRECISION_COMPLEX) +* +* make off-diagonal elements real and copy them to E +* + IF( UPPER ) THEN + DO 60 I = 1, N - 1 + TMP = AB( ABOFDPOS, I+1 ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I+1 ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP +C IF( WANTZ ) THEN +C CALL DSCAL( N, ( TMP ), Q( 1, I+1 ), 1 ) +C END IF + 60 CONTINUE + ELSE + DO 70 I = 1, N - 1 + TMP = AB( ABOFDPOS, I ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP +C IF( WANTQ ) THEN +C CALL DSCAL( N, TMP, Q( 1, I+1 ), 1 ) +C END IF + 70 CONTINUE + ENDIF +#else + IF( UPPER ) THEN + DO 60 I = 1, N-1 + E( I ) = ( AB( ABOFDPOS, I+1 ) ) + 60 CONTINUE + ELSE + DO 70 I = 1, N-1 + E( I ) = ( AB( ABOFDPOS, I ) ) + 70 CONTINUE + ENDIF +#endif + GOTO 200 + END IF +* +* Main code start here. +* Reduce the symmetric band of A to a tridiagonal matrix. +* + THGRSIZ = N + GRSIZ = 1 + SHIFT = 3 + NBTILES = CEILING( REAL(N)/REAL(KD) ) + STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) + THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) +* + CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) + CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) +* +* +* openMP parallelisation start here +* +#if defined(_OPENMP) +!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) +!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) +!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) +!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT ) +!$OMP MASTER +#endif +* +* main bulge chasing loop +* + DO 100 THGRID = 1, THGRNB + STT = (THGRID-1)*THGRSIZ+1 + THED = MIN( (STT + THGRSIZ -1), (N-1)) + DO 110 I = STT, N-1 + ED = MIN( I, THED ) + IF( STT.GT.ED ) GOTO 100 + DO 120 M = 1, STEPERCOL + ST = STT + DO 130 SWEEPID = ST, ED + DO 140 K = 1, GRSIZ + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + $ + (M-1)*GRSIZ + K + IF ( MYID.EQ.1 ) THEN + TTYPE = 1 + ELSE + TTYPE = MOD( MYID, 2 ) + 2 + ENDIF + + IF( TTYPE.EQ.2 ) THEN + COLPT = (MYID/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + BLKLASTIND = COLPT + ELSE + COLPT = ((MYID+1)/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + IF( ( STIND.GE.EDIND-1 ).AND. + $ ( EDIND.EQ.N ) ) THEN + BLKLASTIND = N + ELSE + BLKLASTIND = 0 + ENDIF + ENDIF +* +* Call the kernel +* +#if defined(_OPENMP) + IF( TTYPE.NE.1 ) THEN +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(in:WORK(MYID-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ELSE +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ENDIF +#else + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +#endif + IF ( BLKLASTIND.GE.(N-1) ) THEN + STT = STT + 1 + GOTO 130 + ENDIF + 140 CONTINUE + 130 CONTINUE + 120 CONTINUE + 110 CONTINUE + 100 CONTINUE +* +#if defined(_OPENMP) +!$OMP END MASTER +!$OMP END PARALLEL +#endif +* +* Copy the diagonal from A to D. Note that D is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + DO 150 I = 1, N + D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) + 150 CONTINUE +* +* Copy the off diagonal from A to E. Note that E is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + IF( UPPER ) THEN + DO 160 I = 1, N-1 + E( I ) = ( WORK( OFDPOS+I*LDA ) ) + 160 CONTINUE + ELSE + DO 170 I = 1, N-1 + E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) ) + 170 CONTINUE + ENDIF +* + 200 CONTINUE +* + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of DSYTRD_SB2ST +* + END +#undef PRECISION_REAL + diff --git a/SRC/dsytrd_sy2sb.f b/SRC/dsytrd_sy2sb.f new file mode 100644 index 00000000..8f0261df --- /dev/null +++ b/SRC/dsytrd_sy2sb.f @@ -0,0 +1,517 @@ +*> \brief \b DSYTRD_SY2SB +* +* @generated from zhetrd_he2hb.f, fortran z -> d, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DSYTRD_SY2SB + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), AB( LDAB, * ), +* TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric +*> band-diagonal form AB by a orthogonal similarity transformation: +*> Q**T * A * Q = AB. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the reduced matrix if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> The reduced matrix is stored in the array AB. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the diagonal and first superdiagonal +*> of A are overwritten by the corresponding elements of the +*> tridiagonal matrix T, and the elements above the first +*> superdiagonal, with the array TAU, represent the orthogonal +*> matrix Q as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and first subdiagonal of A are over- +*> written by the corresponding elements of the tridiagonal +*> matrix T, and the elements below the first subdiagonal, with +*> the array TAU, represent the orthogonal matrix Q as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB,N) +*> On exit, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (N-KD) +*> The scalar factors of the elementary reflectors (see Further +*> Details). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK(1) returns the size of LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK which should be calculated +* by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD +*> where FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice otherwise +*> putting LWORK=-1 will provide the size of WORK. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup doubleSYcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +*> \verbatim +*> +*> If UPLO = 'U', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**T +*> +*> where tau is a real scalar, and v is a real vector with +*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in +*> A(i,i+kd+1:n), and tau in TAU(i). +*> +*> If UPLO = 'L', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(1) H(2) . . . H(k), where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**T +*> +*> where tau is a real scalar, and v is a real vector with +*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in +* A(i+kd+2:n,i), and tau in TAU(i). +*> +*> The contents of A on exit are illustrated by the following examples +*> with n = 5: +*> +*> if UPLO = 'U': if UPLO = 'L': +*> +*> ( ab ab/v1 v1 v1 v1 ) ( ab ) +*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab ) +*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab ) +*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab ) +*> ( ab ) ( v1 v2 v3 ab/v4 ab ) +*> +*> where d and e denote diagonal and off-diagonal elements of T, and vi +*> denotes an element of the vector defining H(i). +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), AB( LDAB, * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION RONE + DOUBLE PRECISION ZERO, ONE, HALF + PARAMETER ( RONE = 1.0D+0, + $ ZERO = 0.0D+0, + $ ONE = 1.0D+0, + $ HALF = 0.5D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER + INTEGER I, J, IINFO, LWMIN, PN, PK, LK, + $ LDT, LDW, LDS2, LDS1, + $ LS2, LS1, LW, LT, + $ TPOS, WPOS, S2POS, S1POS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, DSYR2K, DSYMM, DGEMM, + $ DLARFT, DGELQF, DGEQRF, DLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required +* and test the input parameters +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) + LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) + + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( KD.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN + INFO = -7 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN + END IF +* +* Quick return if possible +* Copy the upper/lower portion of A into AB +* + IF( N.LE.KD+1 ) THEN + IF( UPPER ) THEN + DO 100 I = 1, N + LK = MIN( KD+1, I ) + CALL DCOPY( LK, A( I-LK+1, I ), 1, + $ AB( KD+1-LK+1, I ), 1 ) + 100 CONTINUE + ELSE + DO 110 I = 1, N + LK = MIN( KD+1, N-I+1 ) + CALL DCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 ) + 110 CONTINUE + ENDIF + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine the pointer position for the workspace +* + LDT = KD + LDS1 = KD + LT = LDT*KD + LW = N*KD + LS1 = LDS1*KD + LS2 = LWMIN - LT - LW - LS1 +* LS2 = N*MAX(KD,FACTOPTNB) + TPOS = 1 + WPOS = TPOS + LT + S1POS = WPOS + LW + S2POS = S1POS + LS1 + IF( UPPER ) THEN + LDW = KD + LDS2 = KD + ELSE + LDW = N + LDS2 = N + ENDIF +* +* +* Set the workspace of the triangular matrix T to zero once such a +* way everytime T is generated the upper/lower portion will be always zero +* + CALL DLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) +* + IF( UPPER ) THEN + DO 10 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the LQ factorization of the current block +* + CALL DGELQF( KD, PN, A( I, I+KD ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 20 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 20 CONTINUE +* + CALL DLASET( 'Lower', PK, PK, ZERO, ONE, + $ A( I, I+KD ), LDA ) +* +* Form the matrix T +* + CALL DLARFT( 'Forward', 'Rowwise', PN, PK, + $ A( I, I+KD ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL DGEMM( 'Conjugate', 'No transpose', PK, PN, PK, + $ ONE, WORK( TPOS ), LDT, + $ A( I, I+KD ), LDA, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL DSYMM( 'Right', UPLO, PK, PN, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL DGEMM( 'No transpose', 'Conjugate', PK, PK, PN, + $ ONE, WORK( WPOS ), LDW, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL DGEMM( 'No transpose', 'No transpose', PK, PN, PK, + $ -HALF, WORK( S1POS ), LDS1, + $ A( I, I+KD ), LDA, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V'*W - W'*V +* + CALL DSYR2K( UPLO, 'Conjugate', PN, PK, + $ -ONE, A( I, I+KD ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) + 10 CONTINUE +* +* Copy the upper band to AB which is the band storage matrix +* + DO 30 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL DCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 30 CONTINUE +* + ELSE +* +* Reduce the lower triangle of A to lower band matrix +* + DO 40 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the QR factorization of the current block +* + CALL DGEQRF( PN, KD, A( I+KD, I ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 50 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 50 CONTINUE +* + CALL DLASET( 'Upper', PK, PK, ZERO, ONE, + $ A( I+KD, I ), LDA ) +* +* Form the matrix T +* + CALL DLARFT( 'Forward', 'Columnwise', PN, PK, + $ A( I+KD, I ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ ONE, A( I+KD, I ), LDA, + $ WORK( TPOS ), LDT, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL DSYMM( 'Left', UPLO, PN, PK, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL DGEMM( 'Conjugate', 'No transpose', PK, PK, PN, + $ ONE, WORK( S2POS ), LDS2, + $ WORK( WPOS ), LDW, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL DGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ -HALF, A( I+KD, I ), LDA, + $ WORK( S1POS ), LDS1, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V*W' - W*V' +* + CALL DSYR2K( UPLO, 'No transpose', PN, PK, + $ -ONE, A( I+KD, I ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) +* ================================================================== +* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED +* DO 45 J = I, I+PK-1 +* LK = MIN( KD, N-J ) + 1 +* CALL DCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 ) +* 45 CONTINUE +* ================================================================== + 40 CONTINUE +* +* Copy the lower band to AB which is the band storage matrix +* + DO 60 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL DCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 60 CONTINUE + + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of DSYTRD_SY2SB +* + END diff --git a/SRC/ilaenv.f b/SRC/ilaenv.f index 42a380cf..c66f1679 100644 --- a/SRC/ilaenv.f +++ b/SRC/ilaenv.f @@ -189,7 +189,8 @@ * .. Executable Statements .. * GO TO ( 10, 10, 10, 80, 90, 100, 110, 120, - $ 130, 140, 150, 160, 160, 160, 160, 160 )ISPEC + $ 130, 140, 150, 160, 160, 160, 160, 160, + $ 170, 170, 170, 170, 170 )ISPEC * * Invalid value for ISPEC * @@ -690,6 +691,13 @@ ILAENV = IPARMQ( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) RETURN * + 170 CONTINUE +* +* 17 <= ISPEC <= 21: 2stage eigenvalues and SVD or related subroutines. +* + ILAENV = IPARAM2STAGE( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) + RETURN +* * End of ILAENV * END diff --git a/SRC/iparam2stage.F b/SRC/iparam2stage.F new file mode 100644 index 00000000..6443f16e --- /dev/null +++ b/SRC/iparam2stage.F @@ -0,0 +1,388 @@ +*> \brief \b IPARAM2STAGE +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download IPARAM2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/iparam2stage.F"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/iparam2stage.F"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/iparam2stage.F"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, +* NI, NBI, IBI, NXI ) +* #if defined(_OPENMP) +* use omp_lib +* #endif +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER*( * ) NAME, OPTS +* INTEGER ISPEC, NI, NBI, IBI, NXI +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> This program sets problem and machine dependent parameters +*> useful for xHETRD_2STAGE, xHETRD_H@2HB, xHETRD_HB2ST, +*> xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD +*> and related subroutines for eigenvalue problems. +*> It is called whenever ILAENV is called with 17 <= ISPEC <= 21 +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] ISPEC +*> \verbatim +*> ISPEC is integer scalar +*> ISPEC specifies which tunable parameter IPARAM2STAGE should +*> return. +*> +*> ISPEC=17: the optimal blocksize nb for the reduction to +* BAND +*> +*> ISPEC=18: the optimal blocksize ib for the eigenvectors +*> singular vectors update routine +*> +*> ISPEC=19: The length of the array that store the Housholder +*> representation for the second stage +*> Band to Tridiagonal or Bidiagonal +*> +*> ISPEC=20: The workspace needed for the routine in input. +*> +*> ISPEC=21: For future release. +*> \endverbatim +*> +*> \param[in] NAME +*> \verbatim +*> NAME is character string +*> Name of the calling subroutine +*> \endverbatim +*> +*> \param[in] OPTS +*> \verbatim +*> OPTS is CHARACTER*(*) +*> The character options to the subroutine NAME, concatenated +*> into a single character string. For example, UPLO = 'U', +*> TRANS = 'T', and DIAG = 'N' for a triangular routine would +*> be specified as OPTS = 'UTN'. +*> \endverbatim +*> +*> \param[in] NI +*> \verbatim +*> NI is INTEGER which is the size of the matrix +*> \endverbatim +*> +*> \param[in] NBI +*> \verbatim +*> NBI is INTEGER which is the used in the reduciton, +* (e.g., the size of the band), needed to compute workspace +* and LHOUS2. +*> \endverbatim +*> +*> \param[in] IBI +*> \verbatim +*> IBI is INTEGER which represent the IB of the reduciton, +* needed to compute workspace and LHOUS2. +*> \endverbatim +*> +*> \param[in] NXI +*> \verbatim +*> NXI is INTEGER needed in the future release. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup auxOTHERauxiliary +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All detail are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, + $ NI, NBI, IBI, NXI ) +#if defined(_OPENMP) + use omp_lib +#endif + IMPLICIT NONE +* +* -- LAPACK auxiliary routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER*( * ) NAME, OPTS + INTEGER ISPEC, NI, NBI, IBI, NXI +* +* ================================================================ +* .. +* .. Local Scalars .. + INTEGER I, IC, IZ, KD, IB, LHOUS, LWORK, NTHREADS, + $ FACTOPTNB, QROPTNB, LQOPTNB + LOGICAL RPREC, CPREC + CHARACTER PREC*1, ALGO*3, STAG*5, SUBNAM*12, VECT*3 +* .. +* .. Intrinsic Functions .. + INTRINSIC CHAR, ICHAR, MAX +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Executable Statements .. +* +* Invalid value for ISPEC +* + IF( (ISPEC.LT.17).OR.(ISPEC.GT.21) ) THEN + IPARAM2STAGE = -1 + RETURN + ENDIF +* +* Get the number of threads +* + NTHREADS = 1 +#if defined(_OPENMP) +!$OMP PARALLEL + NTHREADS = OMP_GET_NUM_THREADS() +!$OMP END PARALLEL +#endif +* WRITE(*,*) 'IPARAM VOICI NTHREADS ISPEC ',NTHREADS, ISPEC + IF( ISPEC.EQ.19 ) GOTO 19 +* +* Convert NAME to upper case if the first character is lower case. +* + IPARAM2STAGE = -1 + SUBNAM = NAME + IC = ICHAR( SUBNAM( 1: 1 ) ) + IZ = ICHAR( 'Z' ) + IF( IZ.EQ.90 .OR. IZ.EQ.122 ) THEN +* +* ASCII character set +* + IF( IC.GE.97 .AND. IC.LE.122 ) THEN + SUBNAM( 1: 1 ) = CHAR( IC-32 ) + DO 100 I = 2, 12 + IC = ICHAR( SUBNAM( I: I ) ) + IF( IC.GE.97 .AND. IC.LE.122 ) + $ SUBNAM( I: I ) = CHAR( IC-32 ) + 100 CONTINUE + END IF +* + ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN +* +* EBCDIC character set +* + IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. + $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. + $ ( IC.GE.162 .AND. IC.LE.169 ) ) THEN + SUBNAM( 1: 1 ) = CHAR( IC+64 ) + DO 110 I = 2, 12 + IC = ICHAR( SUBNAM( I: I ) ) + IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. + $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. + $ ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I: + $ I ) = CHAR( IC+64 ) + 110 CONTINUE + END IF +* + ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN +* +* Prime machines: ASCII+128 +* + IF( IC.GE.225 .AND. IC.LE.250 ) THEN + SUBNAM( 1: 1 ) = CHAR( IC-32 ) + DO 120 I = 2, 12 + IC = ICHAR( SUBNAM( I: I ) ) + IF( IC.GE.225 .AND. IC.LE.250 ) + $ SUBNAM( I: I ) = CHAR( IC-32 ) + 120 CONTINUE + END IF + END IF +* + PREC = SUBNAM( 1: 1 ) + ALGO = SUBNAM( 4: 6 ) + STAG = SUBNAM( 8:12 ) + RPREC = PREC.EQ.'S' .OR. PREC.EQ.'D' + CPREC = PREC.EQ.'C' .OR. PREC.EQ.'Z' +* +* Invalid value for PRECISION +* + IF( .NOT.( RPREC .OR. CPREC ) ) THEN + IPARAM2STAGE = -1 + RETURN + ENDIF +* WRITE(*,*),'RPREC,CPREC ',RPREC,CPREC, +* $ ' ALGO ',ALGO,' STAGE ',STAG +* + GO TO ( 17, 17, 19, 20, 21 ) ISPEC-16 +* + 17 CONTINUE +* +* ISPEC = 17, 18: block size KD, IB +* Could be also dependent from N but for now it +* depend only on sequential or parallel +* + IF( NTHREADS.GT.4 ) THEN + IF( CPREC ) THEN + KD = 128 + IB = 32 + ELSE + KD = 160 + IB = 40 + ENDIF + ELSE IF( NTHREADS.GT.1 ) THEN + IF( CPREC ) THEN + KD = 64 + IB = 32 + ELSE + KD = 64 + IB = 32 + ENDIF + ELSE + IF( CPREC ) THEN + KD = 16 + IB = 16 + ELSE + KD = 32 + IB = 16 + ENDIF + ENDIF + IF( ISPEC.EQ.17 ) IPARAM2STAGE = KD + IF( ISPEC.EQ.18 ) IPARAM2STAGE = IB + RETURN +* + 19 CONTINUE +* +* ISPEC = 19: +* LHOUS length of the Houselholder representation +* matrix (V,T) of the second stage. should be >= 1. +* +* Will add the VECT OPTION HERE next release + VECT = OPTS(1:1) + IF( VECT.EQ.'N' ) THEN + LHOUS = MAX( 1, 4*NI ) + ELSE +* This is not correct, it need to call the ALGO and the stage2 + LHOUS = MAX( 1, 4*NI ) + IBI + ENDIF + IF( LHOUS.GE.0 ) THEN + IPARAM2STAGE = LHOUS + ELSE + IPARAM2STAGE = -1 + ENDIF + RETURN +* + 20 CONTINUE +* +* ISPEC = 20: (21 for future use) +* LWORK length of the workspace for +* either or both stages for TRD and BRD. should be >= 1. +* TRD: +* TRD_stage 1: = LT + LW + LS1 + LS2 +* = LDT*KD + N*KD + N*MAX(KD,FACTOPTNB) + LDS2*KD +* where LDT=LDS2=KD +* = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD +* TRD_stage 2: = (2NB+1)*N + KD*NTHREADS +* TRD_both : = max(stage1,stage2) + AB ( AB=(KD+1)*N ) +* = N*KD + N*max(KD+1,FACTOPTNB) +* + max(2*KD*KD, KD*NTHREADS) +* + (KD+1)*N + LWORK = -1 + SUBNAM(1:1) = PREC + SUBNAM(2:6) = 'GEQRF' + QROPTNB = ILAENV( 1, SUBNAM, ' ', NI, NBI, -1, -1 ) + SUBNAM(2:6) = 'GELQF' + LQOPTNB = ILAENV( 1, SUBNAM, ' ', NBI, NI, -1, -1 ) +* Could be QR or LQ for TRD and the max for BRD + FACTOPTNB = MAX(QROPTNB, LQOPTNB) + IF( ALGO.EQ.'TRD' ) THEN + IF( STAG.EQ.'2STAG' ) THEN + LWORK = NI*NBI + NI*MAX(NBI+1,FACTOPTNB) + $ + MAX(2*NBI*NBI, NBI*NTHREADS) + $ + (NBI+1)*NI + ELSE IF( (STAG.EQ.'HE2HB').OR.(STAG.EQ.'SY2SB') ) THEN + LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI + ELSE IF( (STAG.EQ.'HB2ST').OR.(STAG.EQ.'SB2ST') ) THEN + LWORK = (2*NBI+1)*NI + NBI*NTHREADS + ENDIF + ELSE IF( ALGO.EQ.'BRD' ) THEN + IF( STAG.EQ.'2STAG' ) THEN + LWORK = 2*NI*NBI + NI*MAX(NBI+1,FACTOPTNB) + $ + MAX(2*NBI*NBI, NBI*NTHREADS) + $ + (NBI+1)*NI + ELSE IF( STAG.EQ.'GE2GB' ) THEN + LWORK = NI*NBI + NI*MAX(NBI,FACTOPTNB) + 2*NBI*NBI + ELSE IF( STAG.EQ.'GB2BD' ) THEN + LWORK = (3*NBI+1)*NI + NBI*NTHREADS + ENDIF + ENDIF + LWORK = MAX ( 1, LWORK ) + + IF( LWORK.GT.0 ) THEN + IPARAM2STAGE = LWORK + ELSE + IPARAM2STAGE = -1 + ENDIF + RETURN +* + 21 CONTINUE +* +* ISPEC = 21 for future use + IPARAM2STAGE = NXI + RETURN +* +* ==== End of IPARAM2STAGE ==== +* + END diff --git a/SRC/slarfy.f b/SRC/slarfy.f new file mode 100644 index 00000000..19a7fa6d --- /dev/null +++ b/SRC/slarfy.f @@ -0,0 +1,161 @@ +*> \brief \b SLARFY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INCV, LDC, N +* REAL TAU +* .. +* .. Array Arguments .. +* REAL C( LDC, * ), V( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFY applies an elementary reflector, or Householder matrix, H, +*> to an n x n symmetric matrix C, from both the left and the right. +*> +*> H is represented in the form +*> +*> H = I - tau * v * v' +*> +*> where tau is a scalar and v is a vector. +*> +*> If tau is zero, then H is taken to be the unit matrix. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the upper or lower triangular part of the +*> symmetric matrix C is stored. +*> = 'U': Upper triangle +*> = 'L': Lower triangle +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of rows and columns of the matrix C. N >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension +*> (1 + (N-1)*abs(INCV)) +*> The vector v as described above. +*> \endverbatim +*> +*> \param[in] INCV +*> \verbatim +*> INCV is INTEGER +*> The increment between successive elements of v. INCV must +*> not be zero. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is REAL +*> The value tau as described above. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC, N) +*> On entry, the matrix C. +*> On exit, C is overwritten by H * C * H'. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max( 1, N ). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (N) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2011 +* +*> \ingroup single_eig +* +* ===================================================================== + SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* -- LAPACK test routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2011 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCV, LDC, N + REAL TAU +* .. +* .. Array Arguments .. + REAL C( LDC, * ), V( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO, HALF + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, HALF = 0.5E+0 ) +* .. +* .. Local Scalars .. + REAL ALPHA +* .. +* .. External Subroutines .. + EXTERNAL SAXPY, SSYMV, SSYR2 +* .. +* .. External Functions .. + REAL SDOT + EXTERNAL SDOT +* .. +* .. Executable Statements .. +* + IF( TAU.EQ.ZERO ) + $ RETURN +* +* Form w:= C * v +* + CALL SSYMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 ) +* + ALPHA = -HALF*TAU*SDOT( N, WORK, 1, V, INCV ) + CALL SAXPY( N, ALPHA, V, INCV, WORK, 1 ) +* +* C := C - v * w' - w * v' +* + CALL SSYR2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC ) +* + RETURN +* +* End of SLARFY +* + END diff --git a/SRC/ssb2st_kernels.f b/SRC/ssb2st_kernels.f new file mode 100644 index 00000000..60058dda --- /dev/null +++ b/SRC/ssb2st_kernels.f @@ -0,0 +1,320 @@ +*> \brief \b SSB2ST_KERNELS +* +* @generated from zhb2st_kernels.f, fortran z -> s, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSB2ST_KERNELS + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssb2st_kernels.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssb2st_kernels.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssb2st_kernels.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* ST, ED, SWEEP, N, NB, IB, +* A, LDA, V, TAU, LDVT, WORK) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* LOGICAL WANTZ +* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), V( * ), +* TAU( * ), WORK( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSB2ST_KERNELS is an internal routine used by the SSYTRD_SB2ST +*> subroutine. +*> \endverbatim +* +* Arguments: +* ========== +* +*> @param[in] n +*> The order of the matrix A. +*> +*> @param[in] nb +*> The size of the band. +*> +*> @param[in, out] A +*> A pointer to the matrix A. +*> +*> @param[in] lda +*> The leading dimension of the matrix A. +*> +*> @param[out] V +*> REAL array, dimension 2*n if eigenvalues only are +*> requested or to be queried for vectors. +*> +*> @param[out] TAU +*> REAL array, dimension (2*n). +*> The scalar factors of the Householder reflectors are stored +*> in this array. +*> +*> @param[in] st +*> internal parameter for indices. +*> +*> @param[in] ed +*> internal parameter for indices. +*> +*> @param[in] sweep +*> internal parameter for indices. +*> +*> @param[in] Vblksiz +*> internal parameter for indices. +*> +*> @param[in] wantz +*> logical which indicate if Eigenvalue are requested or both +*> Eigenvalue/Eigenvectors. +*> +*> @param[in] work +*> Workspace of size nb. +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, + $ ST, ED, SWEEP, N, NB, IB, + $ A, LDA, V, TAU, LDVT, WORK) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + LOGICAL WANTZ + INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. + REAL A( LDA, * ), V( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, + $ ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, + $ DPOS, OFDPOS, AJETER + REAL CTMP +* .. +* .. External Subroutines .. + EXTERNAL SLARFG, SLARFX, SLARFY +* .. +* .. Intrinsic Functions .. + INTRINSIC MOD +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. +* .. Executable Statements .. +* + AJETER = IB + LDVT + UPPER = LSAME( UPLO, 'U' ) + + IF( UPPER ) THEN + DPOS = 2 * NB + 1 + OFDPOS = 2 * NB + ELSE + DPOS = 1 + OFDPOS = 2 + ENDIF + +* +* Upper case +* + IF( UPPER ) THEN +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 101, 102, 103 ) TTYPE +* + 101 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 10 I = 1, LM-1 + V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) ) + A( OFDPOS-I, ST+I ) = ZERO + 10 CONTINUE + CTMP = ( A( OFDPOS, ST ) ) + CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) + A( OFDPOS, ST ) = CTMP +* + 103 CONTINUE + LM = ED - ST + 1 + CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + GOTO 300 +* + 102 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 + IF( LM.GT.0) THEN + CALL SLARFX( 'Left', LN, LM, V( VPOS ), + $ ( TAU( TAUPOS ) ), A( DPOS-NB, J1 ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 30 I = 1, LM-1 + V( VPOS+I ) = ( A( DPOS-NB-I, J1+I ) ) + A( DPOS-NB-I, J1+I ) = ZERO + 30 CONTINUE + CTMP = ( A( DPOS-NB, J1 ) ) + CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) + A( DPOS-NB, J1 ) = CTMP +* + CALL SLARFX( 'Right', LN-1, LM, V( VPOS ), + $ TAU( TAUPOS ), + $ A( DPOS-NB+1, J1 ), LDA-1, WORK) + ENDIF + GOTO 300 +* +* Lower case +* + ELSE +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 201, 202, 203 ) TTYPE +* + 201 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 20 I = 1, LM-1 + V( VPOS+I ) = A( OFDPOS+I, ST-1 ) + A( OFDPOS+I, ST-1 ) = ZERO + 20 CONTINUE + CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + 203 CONTINUE + LM = ED - ST + 1 +* + CALL SLARFY( UPLO, LM, V( VPOS ), 1, ( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + + GOTO 300 +* + 202 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 +* + IF( LM.GT.0) THEN + CALL SLARFX( 'Right', LM, LN, V( VPOS ), + $ TAU( TAUPOS ), A( DPOS+NB, ST ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 40 I = 1, LM-1 + V( VPOS+I ) = A( DPOS+NB+I, ST ) + A( DPOS+NB+I, ST ) = ZERO + 40 CONTINUE + CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + CALL SLARFX( 'Left', LM, LN-1, V( VPOS ), + $ ( TAU( TAUPOS ) ), + $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) + + ENDIF + GOTO 300 + ENDIF + + 300 CONTINUE + RETURN +* +* END OF SSB2ST_KERNELS +* + END diff --git a/SRC/ssbev_2stage.f b/SRC/ssbev_2stage.f new file mode 100644 index 00000000..821c00a3 --- /dev/null +++ b/SRC/ssbev_2stage.f @@ -0,0 +1,377 @@ +*> \brief <b> SSBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from dsbev_2stage.f, fortran d -> s, Sat Nov 5 23:58:09 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSBEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. +* REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is REAL array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. + REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, WANTZ, LQUERY + INTEGER IINFO, IMAX, INDE, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSB + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA + $ SSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -11 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSBEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + IF( LOWER ) THEN + W( 1 ) = AB( 1, 1 ) + ELSE + W( 1 ) = AB( KD+1, 1 ) + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, call SSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, WORK( INDE ), INFO ) + ELSE + CALL SSTEQR( JOBZ, N, W, WORK( INDE ), Z, LDZ, WORK( INDWRK ), + $ INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of SSBEV_2STAGE +* + END diff --git a/SRC/ssbevd_2stage.f b/SRC/ssbevd_2stage.f new file mode 100644 index 00000000..8a306306 --- /dev/null +++ b/SRC/ssbevd_2stage.f @@ -0,0 +1,412 @@ +*> \brief <b> SSBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from dsbevd_2stage.f, fortran d -> s, Sat Nov 5 23:58:03 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSBEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses +*> a divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is REAL array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK and IWORK +*> arrays, returns these values as the first entries of the WORK +*> and IWORK arrays, and no error message related to LWORK or +*> LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 2, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK and IWORK arrays, and no error message related to +*> LWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, LIWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL AB( LDAB, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, INDE, INDWK2, INDWRK, ISCALE, LIWMIN, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ LLWRK2 + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSB + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC, + $ SSTERF, XERBLA, SSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( N.LE.1 ) THEN + LIWMIN = 1 + LWMIN = 1 + ELSE + IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LIWMIN = 3 + 5*N + LWMIN = 1 + 5*N + 2*N**2 + ELSE + LIWMIN = 1 + LWMIN = MAX( 2*N, N+LHTRD+LWTRD ) + END IF + END IF + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSBEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = AB( 1, 1 ) + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call SSYTRD_SB2ST to reduce band symmetric matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, call SSTEDC. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, WORK( INDE ), INFO ) + ELSE + CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO ) + CALL SGEMM( 'N', 'N', N, N, N, ONE, Z, LDZ, WORK( INDWRK ), N, + $ ZERO, WORK( INDWK2 ), N ) + CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) + $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN + RETURN +* +* End of SSBEVD_2STAGE +* + END diff --git a/SRC/ssbevx_2stage.f b/SRC/ssbevx_2stage.f new file mode 100644 index 00000000..d3a588c4 --- /dev/null +++ b/SRC/ssbevx_2stage.f @@ -0,0 +1,633 @@ +*> \brief <b> SSBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @generated from dsbevx_2stage.f, fortran d -> s, Sat Nov 5 23:58:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSBEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q, +* LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z, +* LDZ, WORK, LWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* REAL AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric band matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found; +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found; +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] Q +*> \verbatim +*> Q is REAL array, dimension (LDQ, N) +*> If JOBZ = 'V', the N-by-N orthogonal matrix used in the +*> reduction to tridiagonal form. +*> If JOBZ = 'N', the array Q is not referenced. +*> \endverbatim +*> +*> \param[in] LDQ +*> \verbatim +*> LDQ is INTEGER +*> The leading dimension of the array Q. If JOBZ = 'V', then +*> LDQ >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing AB to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*SLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*SLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is REAL array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK) +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 7*N, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS + 2*N +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup realOTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, Q, + $ LDQ, VL, VU, IL, IU, ABSTOL, M, W, Z, + $ LDZ, WORK, LWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + REAL AB( LDAB, * ), Q( LDQ, * ), W( * ), WORK( * ), + $ Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ, + $ LQUERY + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWO, INDWRK, ISCALE, ITMP1, J, JJ, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ NSPLIT + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSB + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGEMV, SLACPY, SLASCL, SSCAL, + $ SSTEBZ, SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA, + $ SSYTRD_SB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -7 + ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -11 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -12 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -13 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) + $ INFO = -18 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'SSYTRD_SB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_SB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -20 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSBEVX_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + M = 1 + IF( LOWER ) THEN + TMP1 = AB( 1, 1 ) + ELSE + TMP1 = AB( KD+1, 1 ) + END IF + IF( VALEIG ) THEN + IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) ) + $ M = 0 + END IF + IF( M.EQ.1 ) THEN + W( 1 ) = TMP1 + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + ELSE + VLL = ZERO + VUU = ZERO + END IF + ANRM = SLANSB( 'M', UPLO, N, KD, AB, LDAB, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL SLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL SLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call SSYTRD_SB2ST to reduce symmetric band matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDHOUS = INDE + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ), + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal +* to zero, then call SSTERF or SSTEQR. If this fails for some +* eigenvalue, then try SSTEBZ. +* + TEST = .FALSE. + IF (INDEIG) THEN + IF (IL.EQ.1 .AND. IU.EQ.N) THEN + TEST = .TRUE. + END IF + END IF + IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN + CALL SCOPY( N, WORK( INDD ), 1, W, 1 ) + INDEE = INDWRK + 2*N + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL SLACPY( 'A', N, N, Q, LDQ, Z, LDZ ) + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ, + $ WORK( INDWRK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 10 I = 1, N + IFAIL( I ) = 0 + 10 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWO = INDISP + N + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by SSTEIN. +* + DO 20 J = 1, M + CALL SCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 ) + CALL SGEMV( 'N', N, N, ONE, Q, LDQ, WORK, 1, ZERO, + $ Z( 1, J ), 1 ) + 20 CONTINUE + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of SSBEVX_2STAGE +* + END diff --git a/SRC/ssyev_2stage.f b/SRC/ssyev_2stage.f new file mode 100644 index 00000000..52f11c35 --- /dev/null +++ b/SRC/ssyev_2stage.f @@ -0,0 +1,348 @@ +*> \brief <b> SSYEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @generated from dsyev_2stage.f, fortran d -> s, Sat Nov 5 23:55:51 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSY + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, + $ XERBLA, SSYTRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -8 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + W( 1 ) = A( 1, 1 ) + WORK( 1 ) = 2 + IF( WANTZ ) + $ A( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDE = 1 + INDTAU = INDE + N + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, first call +* SORGTR to generate the orthogonal matrix, then call SSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, WORK( INDE ), INFO ) + ELSE +* Not available in this release, and agrument checking should not +* let it getting here + RETURN + CALL SORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), + $ LLWORK, IINFO ) + CALL SSTEQR( JOBZ, N, W, WORK( INDE ), A, LDA, WORK( INDTAU ), + $ INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of SSYEV_2STAGE +* + END diff --git a/SRC/ssyevd_2stage.f b/SRC/ssyevd_2stage.f new file mode 100644 index 00000000..8510b645 --- /dev/null +++ b/SRC/ssyevd_2stage.f @@ -0,0 +1,406 @@ +*> \brief <b> SSYEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @generated from dsyevd_2stage.f, fortran d -> s, Sat Nov 5 23:55:54 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LIWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* REAL A( LDA, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses a +*> divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, +*> dimension (LWORK) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N+1 +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N+1 +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be at least +*> 1 + 6*N + 2*N**2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK and IWORK +*> arrays, returns these values as the first entries of the WORK +*> and IWORK arrays, and no error message related to LWORK or +*> LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK and IWORK arrays, and no error message related to +*> LWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed +*> to converge; i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> if INFO = i and JOBZ = 'V', then the algorithm failed +*> to compute an eigenvalue while working on the submatrix +*> lying in rows and columns INFO/(N+1) through +*> mod(INFO,N+1). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date September 2012 +* +*> \ingroup realSYeigen +* +*> \par Contributors: +* ================== +*> +*> Jeff Rutter, Computer Science Division, University of California +*> at Berkeley, USA \n +*> Modified by Francoise Tisseur, University of Tennessee \n +*> Modified description of INFO. Sven, 16 Feb 05. \n +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSYEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.2) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* September 2012 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LIWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. +* + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, INDE, INDTAU, INDWK2, INDWRK, ISCALE, + $ LIWMIN, LLWORK, LLWRK2, LWMIN, + $ LHTRD, LWTRD, KD, IB, INDHOUS + REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSY + EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF, + $ SSYTRD_2STAGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LIWMIN = 1 + LWMIN = 1 + ELSE + KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LIWMIN = 3 + 5*N + LWMIN = 1 + 6*N + 2*N**2 + ELSE + LIWMIN = 1 + LWMIN = 2*N + 1 + LHTRD + LWTRD + END IF + END IF + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -8 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = A( 1, 1 ) + IF( WANTZ ) + $ A( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL SLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDE = 1 + INDTAU = INDE + N + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call SSTERF. For eigenvectors, first call +* SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the +* tridiagonal matrix, then call SORMTR to multiply it by the +* Householder transformations stored in A. +* + IF( .NOT.WANTZ ) THEN + CALL SSTERF( N, W, WORK( INDE ), INFO ) + ELSE +* Not available in this release, and agrument checking should not +* let it getting here + RETURN + CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, IWORK, LIWORK, INFO ) + CALL SORMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ), + $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO ) + CALL SLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) + $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of SSYEVD_2STAGE +* + END diff --git a/SRC/ssyevr_2stage.f b/SRC/ssyevr_2stage.f new file mode 100644 index 00000000..27b99303 --- /dev/null +++ b/SRC/ssyevr_2stage.f @@ -0,0 +1,745 @@ +*> \brief <b> SSYEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @generated from dsyevr_2stage.f, fortran d -> s, Sat Nov 5 23:50:10 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYEVR_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevr_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevr_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevr_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, +* LWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER ISUPPZ( * ), IWORK( * ) +* REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be +*> selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> +*> SSYEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call +*> to SSYTRD. Then, whenever possible, SSYEVR_2STAGE calls SSTEMR to compute +*> the eigenspectrum using Relatively Robust Representations. SSTEMR +*> computes eigenvalues by the dqds algorithm, while orthogonal +*> eigenvectors are computed from various "good" L D L^T representations +*> (also known as Relatively Robust Representations). Gram-Schmidt +*> orthogonalization is avoided as far as possible. More specifically, +*> the various steps of the algorithm are as follows. +*> +*> For each unreduced block (submatrix) of T, +*> (a) Compute T - sigma I = L D L^T, so that L and D +*> define all the wanted eigenvalues to high relative accuracy. +*> This means that small relative changes in the entries of D and L +*> cause only small relative changes in the eigenvalues and +*> eigenvectors. The standard (unfactored) representation of the +*> tridiagonal matrix T does not have this property in general. +*> (b) Compute the eigenvalues to suitable accuracy. +*> If the eigenvectors are desired, the algorithm attains full +*> accuracy of the computed eigenvalues only right before +*> the corresponding vectors have to be computed, see steps c) and d). +*> (c) For each cluster of close eigenvalues, select a new +*> shift close to the cluster, find a new factorization, and refine +*> the shifted eigenvalues to suitable accuracy. +*> (d) For each eigenvalue with a large enough relative separation compute +*> the corresponding eigenvector by forming a rank revealing twisted +*> factorization. Go back to (c) for any clusters that remain. +*> +*> The desired accuracy of the output can be specified by the input +*> parameter ABSTOL. +*> +*> For more details, see SSTEMR's documentation and: +*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations +*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices," +*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004. +*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and +*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25, +*> 2004. Also LAPACK Working Note 154. +*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric +*> tridiagonal eigenvalue/eigenvector problem", +*> Computer Science Division Technical Report No. UCB/CSD-97-971, +*> UC Berkeley, May 1997. +*> +*> +*> Note 1 : SSYEVR_2STAGE calls SSTEMR when the full spectrum is requested +*> on machines which conform to the ieee-754 floating point standard. +*> SSYEVR_2STAGE calls SSTEBZ and SSTEIN on non-ieee machines and +*> when partial spectrum requests are made. +*> +*> Normal execution of SSTEMR may create NaNs and infinities and +*> hence may abort due to a floating point exception in environments +*> which do not handle NaNs and infinities in the ieee standard default +*> manner. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> For RANGE = 'V' or 'I' and IU - IL < N - 1, SSTEBZ and +*> SSTEIN are called +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> +*> If high relative accuracy is important, set ABSTOL to +*> SLAMCH( 'Safe minimum' ). Doing so will guarantee that +*> eigenvalues are computed to high relative accuracy when +*> possible in future releases. The current code does not +*> make any guarantees about high relative accuracy, but +*> future releases will. See J. Barlow and J. Demmel, +*> "Computing Accurate Eigensystems of Scaled Diagonally +*> Dominant Matrices", LAPACK Working Note #7, for a discussion +*> of which matrices define their eigenvalues to high relative +*> accuracy. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is REAL array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> Supplying N columns is always safe. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] ISUPPZ +*> \verbatim +*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) ) +*> The support of the eigenvectors in Z, i.e., the indices +*> indicating the nonzero elements in Z. The i-th eigenvector +*> is nonzero only in elements ISUPPZ( 2*i-1 ) through +*> ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal +*> matrix). The support of the eigenvectors of A is typically +*> 1:N because of the orthogonal transformations applied by SORMTR. +*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 26*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 5*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 5*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal size of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: Internal error +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup realSYeigen +* +*> \par Contributors: +* ================== +*> +*> Inderjit Dhillon, IBM Almaden, USA \n +*> Osni Marques, LBNL/NERSC, USA \n +*> Ken Stanley, Computer Science Division, University of +*> California at Berkeley, USA \n +*> Jason Riedy, Computer Science Division, University of +*> California at Berkeley, USA \n +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSYEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, + $ LWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LWORK, M, N + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER ISUPPZ( * ), IWORK( * ) + REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, VALEIG, WANTZ, + $ TRYRAC, TEST + CHARACTER ORDER + INTEGER I, IEEEOK, IINFO, IMAX, INDD, INDDD, INDE, + $ INDEE, INDIBL, INDIFL, INDISP, INDIWO, INDTAU, + $ INDWK, INDWKN, ISCALE, J, JJ, LIWMIN, + $ LLWORK, LLWRKN, LWMIN, NSPLIT, + $ LHTRD, LWTRD, KD, IB, INDHOUS + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSY + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN, + $ SSTERF, SSWAP, SSYTRD_2STAGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IEEEOK = ILAENV( 10, 'SSYEVR', 'N', 1, 2, 3, 4 ) +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) +* + LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) +* + KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = MAX( 1, 10*N ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -18 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -20 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN +* NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) +* NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) +* LWKOPT = MAX( ( NB+1 )*N, LWMIN ) + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYEVR_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( N.EQ.1 ) THEN + WORK( 1 ) = 26 + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + ELSE + IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + END IF + END IF + IF( WANTZ ) THEN + Z( 1, 1 ) = ONE + ISUPPZ( 1 ) = 1 + ISUPPZ( 2 ) = 1 + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF (VALEIG) THEN + VLL = VL + VUU = VU + END IF + ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL SSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF + +* Initialize indices into workspaces. Note: The IWORK indices are +* used only if SSTERF or SSTEMR fail. + +* WORK(INDTAU:INDTAU+N-1) stores the scalar factors of the +* elementary reflectors used in SSYTRD. + INDTAU = 1 +* WORK(INDD:INDD+N-1) stores the tridiagonal's diagonal entries. + INDD = INDTAU + N +* WORK(INDE:INDE+N-1) stores the off-diagonal entries of the +* tridiagonal matrix from SSYTRD. + INDE = INDD + N +* WORK(INDDD:INDDD+N-1) is a copy of the diagonal entries over +* -written by SSTEMR (the SSTERF path copies the diagonal to W). + INDDD = INDE + N +* WORK(INDEE:INDEE+N-1) is a copy of the off-diagonal entries over +* -written while computing the eigenvalues in SSTERF and SSTEMR. + INDEE = INDDD + N +* INDHOUS is the starting offset Householder storage of stage 2 + INDHOUS = INDEE + N +* INDWK is the starting offset of the left-over workspace, and +* LLWORK is the remaining workspace size. + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + + +* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in SSTEBZ and +* stores the block indices of each of the M<=N eigenvalues. + INDIBL = 1 +* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in SSTEBZ and +* stores the starting and finishing indices of each block. + INDISP = INDIBL + N +* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors +* that corresponding to eigenvectors that fail to converge in +* SSTEIN. This information is discarded; if any fail, the driver +* returns INFO > 0. + INDIFL = INDISP + N +* INDIWO is the offset of the remaining integer workspace. + INDIWO = INDIFL + N + +* +* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* +* + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired +* then call SSTERF or SSTEMR and SORMTR. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N, WORK( INDD ), 1, W, 1 ) + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SCOPY( N, WORK( INDD ), 1, WORK( INDDD ), 1 ) +* + IF (ABSTOL .LE. TWO*N*EPS) THEN + TRYRAC = .TRUE. + ELSE + TRYRAC = .FALSE. + END IF + CALL SSTEMR( JOBZ, 'A', N, WORK( INDDD ), WORK( INDEE ), + $ VL, VU, IL, IU, M, W, Z, LDZ, N, ISUPPZ, + $ TRYRAC, WORK( INDWK ), LWORK, IWORK, LIWORK, + $ INFO ) +* +* +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by SSTEMR. +* + IF( WANTZ .AND. INFO.EQ.0 ) THEN + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, + $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ), + $ LLWRKN, IINFO ) + END IF + END IF +* +* + IF( INFO.EQ.0 ) THEN +* Everything worked. Skip SSTEBZ/SSTEIN. IWORK(:) are +* undefined. + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN. +* Also call SSTEBZ and SSTEIN if SSTEMR fails. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWK ), IWORK( INDIWO ), IWORK( INDIFL ), + $ INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by SSTEIN. +* + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* +* Jump here if SSTEMR/SSTEIN succeeded. + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. Note: We do not sort the IFAIL portion of IWORK. +* It may not be initialized (if SSTEMR/SSTEIN succeeded), and we do +* not return this detailed information to the user. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + W( I ) = W( J ) + W( J ) = TMP1 + CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of SSYEVR_2STAGE +* + END diff --git a/SRC/ssyevx_2stage.f b/SRC/ssyevx_2stage.f new file mode 100644 index 00000000..96a73ecd --- /dev/null +++ b/SRC/ssyevx_2stage.f @@ -0,0 +1,608 @@ +*> \brief <b> SSYEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for SY matrices</b> +* +* @generated from dsyevx_2stage.f, fortran d -> s, Sat Nov 5 23:55:46 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssyevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssyevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssyevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, WORK, +* LWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N +* REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a real symmetric matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can be +*> selected by specifying either a range of values or a range of indices +*> for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is REAL +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is REAL +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*SLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*SLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> On normal exit, the first M elements contain the selected +*> eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is REAL array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 8*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 3*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 3*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup realSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSYEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK, + $ LWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N + REAL ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + REAL A( LDA, * ), W( * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWO, INDTAU, INDWKN, INDWRK, ISCALE, + $ ITMP1, J, JJ, LLWORK, LLWRKN, + $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANSY + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ, + $ SSTEIN, SSTEQR, SSTERF, SSWAP, XERBLA, + $ SSYTRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = MAX( 8*N, 3*N + LHTRD + LWTRD ) + WORK( 1 ) = LWMIN + END IF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -17 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + ELSE + IF( VL.LT.A( 1, 1 ) .AND. VU.GE.A( 1, 1 ) ) THEN + M = 1 + W( 1 ) = A( 1, 1 ) + END IF + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = SLAMCH( 'Safe minimum' ) + EPS = SLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + END IF + ANRM = SLANSY( 'M', UPLO, N, A, LDA, WORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL SSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL SSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. +* + INDTAU = 1 + INDE = INDTAU + N + INDD = INDE + N + INDHOUS = INDD + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal to +* zero, then call SSTERF or SORGTR and SSTEQR. If this fails for +* some eigenvalue, then try SSTEBZ. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN + CALL SCOPY( N, WORK( INDD ), 1, W, 1 ) + INDEE = INDWRK + 2*N + IF( .NOT.WANTZ ) THEN + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SSTERF( N, W, WORK( INDEE ), INFO ) + ELSE + CALL SLACPY( 'A', N, N, A, LDA, Z, LDZ ) + CALL SORGTR( UPLO, N, Z, LDZ, WORK( INDTAU ), + $ WORK( INDWRK ), LLWORK, IINFO ) + CALL SCOPY( N-1, WORK( INDE ), 1, WORK( INDEE ), 1 ) + CALL SSTEQR( JOBZ, N, W, WORK( INDEE ), Z, LDZ, + $ WORK( INDWRK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 30 I = 1, N + IFAIL( I ) = 0 + 30 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 40 + END IF + INFO = 0 + END IF +* +* Otherwise, call SSTEBZ and, if eigenvectors are desired, SSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWO = INDISP + N + CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) +* +* Apply orthogonal matrix used in reduction to tridiagonal +* form to eigenvectors returned by SSTEIN. +* + INDWKN = INDE + LLWRKN = LWORK - INDWKN + 1 + CALL SORMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 40 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 60 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 50 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 50 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 60 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of SSYEVX_2STAGE +* + END diff --git a/SRC/ssygv_2stage.f b/SRC/ssygv_2stage.f new file mode 100644 index 00000000..6eb172e9 --- /dev/null +++ b/SRC/ssygv_2stage.f @@ -0,0 +1,371 @@ +*> \brief \b SSYGV_2STAGE +* +* @generated from dsygv_2stage.f, fortran d -> s, Sun Nov 6 12:54:29 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYGV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssygv_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssygv_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssygv_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), W( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors +*> of a real generalized symmetric-definite eigenproblem, of the form +*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. +*> Here A and B are assumed to be symmetric and B is also +*> positive definite. +*> This routine use the 2stage technique for the reduction to tridiagonal +*> which showed higher performance on recent architecture and for large +* sizes N>2000. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] ITYPE +*> \verbatim +*> ITYPE is INTEGER +*> Specifies the problem type to be solved: +*> = 1: A*x = (lambda)*B*x +*> = 2: A*B*x = (lambda)*x +*> = 3: B*A*x = (lambda)*x +*> \endverbatim +*> +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangles of A and B are stored; +*> = 'L': Lower triangles of A and B are stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrices A and B. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA, N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> matrix Z of eigenvectors. The eigenvectors are normalized +*> as follows: +*> if ITYPE = 1 or 2, Z**T*B*Z = I; +*> if ITYPE = 3, Z**T*inv(B)*Z = I. +*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') +*> or the lower triangle (if UPLO='L') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB, N) +*> On entry, the symmetric positive definite matrix B. +*> If UPLO = 'U', the leading N-by-N upper triangular part of B +*> contains the upper triangular part of the matrix B. +*> If UPLO = 'L', the leading N-by-N lower triangular part of B +*> contains the lower triangular part of the matrix B. +*> +*> On exit, if INFO <= N, the part of B containing the matrix is +*> overwritten by the triangular factor U or L from the Cholesky +*> factorization B = U**T*U or B = L*L**T. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is REAL array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + 2*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + 2*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: SPOTRF or SSYEV returned an error code: +*> <= N: if INFO = i, SSYEV failed to converge; +*> i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> > N: if INFO = N + i, for 1 <= i <= N, then the leading +*> minor of order i of B is not positive definite. +*> The factorization of B could not be completed and +*> no eigenvalues or eigenvectors were computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realSYeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SSYGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), W( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTZ + CHARACTER TRANS + INTEGER NEIG, + $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SPOTRF, SSYGST, STRMM, STRSM, XERBLA, + $ SSYEV_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN + INFO = -1 + ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -2 + ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'SSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = 2*N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYGV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form a Cholesky factorization of B. +* + CALL SPOTRF( UPLO, N, B, LDB, INFO ) + IF( INFO.NE.0 ) THEN + INFO = N + INFO + RETURN + END IF +* +* Transform problem to standard eigenvalue problem and solve. +* + CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) + CALL SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO ) +* + IF( WANTZ ) THEN +* +* Backtransform eigenvectors to the original problem. +* + NEIG = N + IF( INFO.GT.0 ) + $ NEIG = INFO - 1 + IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN +* +* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; +* backtransform eigenvectors: x = inv(L)**T*y or inv(U)*y +* + IF( UPPER ) THEN + TRANS = 'N' + ELSE + TRANS = 'T' + END IF +* + CALL STRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) +* + ELSE IF( ITYPE.EQ.3 ) THEN +* +* For B*A*x=(lambda)*x; +* backtransform eigenvectors: x = L*y or U**T*y +* + IF( UPPER ) THEN + TRANS = 'T' + ELSE + TRANS = 'N' + END IF +* + CALL STRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) + END IF + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of SSYGV_2STAGE +* + END diff --git a/SRC/ssytrd_2stage.f b/SRC/ssytrd_2stage.f new file mode 100644 index 00000000..fba3dd45 --- /dev/null +++ b/SRC/ssytrd_2stage.f @@ -0,0 +1,337 @@ +*> \brief \b SSYTRD_2STAGE +* +* @generated from zhetrd_2stage.f, fortran z -> s, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYTRD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* HOUS2, LHOUS2, WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER VECT, UPLO +* INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. +* REAL D( * ), E( * ) +* REAL A( LDA, * ), TAU( * ), +* HOUS2( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYTRD_2STAGE reduces a real symmetric matrix A to real symmetric +*> tridiagonal form T by a orthogonal similarity transformation: +*> Q1**T Q2**T* A * Q2 * Q1 = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> in particular for the second stage (Band to +*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, +*> then LHOUS2 is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the band superdiagonal +*> of A are overwritten by the corresponding elements of the +*> internal band-diagonal matrix AB, and the elements above +*> the KD superdiagonal, with the array TAU, represent the orthogonal +*> matrix Q1 as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and band subdiagonal of A are over- +*> written by the corresponding elements of the internal band-diagonal +*> matrix AB, and the elements below the KD subdiagonal, with +*> the array TAU, represent the orthogonal matrix Q1 as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is REAL array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (N-KD) +*> The scalar factors of the elementary reflectors of +*> the first stage (see Further Details). +*> \endverbatim +*> +*> \param[out] HOUS2 +*> \verbatim +*> HOUS2 is REAL array, dimension LHOUS2, that +*> store the Householder representation of the stage2 +*> band to tridiagonal. +*> \endverbatim +*> +*> \param[in] LHOUS2 +*> \verbatim +*> LHOUS2 is INTEGER +*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS2 array, returns +*> this value as the first entry of the HOUS2 array, and no error +*> message related to LHOUS2 is issued by XERBLA. +*> LHOUS2 = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realSYcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + $ HOUS2, LHOUS2, WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER VECT, UPLO + INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. + REAL D( * ), E( * ) + REAL A( LDA, * ), TAU( * ), + $ HOUS2( * ), WORK( * ) +* .. +* +* ===================================================================== +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTQ + INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, SSYTRD_SY2SB, SSYTRD_SB2ST +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + KD = ILAENV( 17, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) + IB = ILAENV( 18, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) +* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, +* $ LHMIN, LWMIN +* + IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYTRD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDAB = KD+1 + LWRK = LWORK-LDAB*N + ABPOS = 1 + WPOS = ABPOS + LDAB*N + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + $ TAU, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) + RETURN + END IF + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, + $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) + RETURN + END IF +* +* + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of SSYTRD_2STAGE +* + END diff --git a/SRC/ssytrd_sb2st.F b/SRC/ssytrd_sb2st.F new file mode 100644 index 00000000..edbcf125 --- /dev/null +++ b/SRC/ssytrd_sb2st.F @@ -0,0 +1,603 @@ +*> \brief \b SSBTRD +* +* @generated from zhetrd_hb2st.F, fortran z -> s, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSBTRD + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssbtrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssbtrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssbtrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +* #define PRECISION_REAL +* +* #if defined(_OPENMP) +* use omp_lib +* #endif +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER STAGE1, UPLO, VECT +* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. +* REAL D( * ), E( * ) +* REAL AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSBTRD reduces a real symmetric band matrix A to real symmetric +*> tridiagonal form T by a orthogonal similarity transformation: +*> Q**T * A * Q = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] STAGE +*> \verbatim +*> STAGE is CHARACTER*1 +*> = 'N': "No": to mention that the stage 1 of the reduction +*> from dense to band using the ssytrd_sy2sb routine +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the ssytrd_sy2sb +*> routine has been called to produce AB (e.g., AB is +*> the output of ssytrd_sy2sb. +*> \endverbatim +*> +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> and thus LHOUS is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, +*> then LHOUS is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB,N) +*> On entry, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> On exit, the diagonal elements of AB are overwritten by the +*> diagonal elements of the tridiagonal matrix T; if KD > 0, the +*> elements on the first superdiagonal (if UPLO = 'U') or the +*> first subdiagonal (if UPLO = 'L') are overwritten by the +*> off-diagonal elements of T; the rest of AB is overwritten by +*> values generated during the reduction. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is REAL array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T: +*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. +*> \endverbatim +*> +*> \param[out] HOUS +*> \verbatim +*> HOUS is REAL array, dimension LHOUS, that +*> store the Householder representation. +*> \endverbatim +*> +*> \param[in] LHOUS +*> \verbatim +*> LHOUS is INTEGER +*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS array, returns +*> this value as the first entry of the HOUS array, and no error +*> message related to LHOUS is issued by XERBLA. +*> LHOUS = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup real16OTHERcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +#define PRECISION_REAL +* +#if defined(_OPENMP) + use omp_lib +#endif +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER STAGE1, UPLO, VECT + INTEGER N, KD, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. + REAL D( * ), E( * ) + REAL AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL RZERO + REAL ZERO, ONE + PARAMETER ( RZERO = 0.0E+0, + $ ZERO = 0.0E+0, + $ ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, + $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, + $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, + $ SISEV, SIZETAU, LDV, LHMIN, LWMIN +#if defined (PRECISION_COMPLEX) + REAL ABSTMP + REAL TMP +#endif +* .. +* .. External Subroutines .. + EXTERNAL SSB2ST_KERNELS, SLACPY, SLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX, CEILING, REAL +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required. +* Test the input parameters +* + DEBUG = 0 + INFO = 0 + AFTERS1 = LSAME( STAGE1, 'Y' ) + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) +* + IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -2 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.(KD+1) ) THEN + INFO = -7 + ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDV = KD + IB + SIZETAU = 2 * N + SISEV = 2 * N + INDTAU = 1 + INDV = INDTAU + SIZETAU + LDA = 2 * KD + 1 + SIZEA = LDA * N + INDA = 1 + INDW = INDA + SIZEA + NTHREADS = 1 + TID = 0 +* + IF( UPPER ) THEN + APOS = INDA + KD + AWPOS = INDA + DPOS = APOS + KD + OFDPOS = DPOS - 1 + ABDPOS = KD + 1 + ABOFDPOS = KD + ELSE + APOS = INDA + AWPOS = INDA + KD + 1 + DPOS = APOS + OFDPOS = DPOS + 1 + ABDPOS = 1 + ABOFDPOS = 2 + + ENDIF +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or +* in a parallel environment it might need two cores for D and E +* + IF( KD.EQ.0 ) THEN + DO 30 I = 1, N + D( I ) = ( AB( ABDPOS, I ) ) + 30 CONTINUE + DO 40 I = 1, N-1 + E( I ) = RZERO + 40 CONTINUE + GOTO 200 + END IF +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal +* and offdiagonal elements real, and store them in D and E. +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler coversion formula might be used, but then +* updating the Q matrix will be required and based if Q is generated +* or not this might complicate the story. +* +C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN + IF( KD.EQ.1 ) THEN + DO 50 I = 1, N + D( I ) = ( AB( ABDPOS, I ) ) + 50 CONTINUE +#if defined (PRECISION_COMPLEX) +* +* make off-diagonal elements real and copy them to E +* + IF( UPPER ) THEN + DO 60 I = 1, N - 1 + TMP = AB( ABOFDPOS, I+1 ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I+1 ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP +C IF( WANTZ ) THEN +C CALL SSCAL( N, ( TMP ), Q( 1, I+1 ), 1 ) +C END IF + 60 CONTINUE + ELSE + DO 70 I = 1, N - 1 + TMP = AB( ABOFDPOS, I ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP +C IF( WANTQ ) THEN +C CALL SSCAL( N, TMP, Q( 1, I+1 ), 1 ) +C END IF + 70 CONTINUE + ENDIF +#else + IF( UPPER ) THEN + DO 60 I = 1, N-1 + E( I ) = ( AB( ABOFDPOS, I+1 ) ) + 60 CONTINUE + ELSE + DO 70 I = 1, N-1 + E( I ) = ( AB( ABOFDPOS, I ) ) + 70 CONTINUE + ENDIF +#endif + GOTO 200 + END IF +* +* Main code start here. +* Reduce the symmetric band of A to a tridiagonal matrix. +* + THGRSIZ = N + GRSIZ = 1 + SHIFT = 3 + NBTILES = CEILING( REAL(N)/REAL(KD) ) + STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) + THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) +* + CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) + CALL SLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) +* +* +* openMP parallelisation start here +* +#if defined(_OPENMP) +!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) +!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) +!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) +!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT ) +!$OMP MASTER +#endif +* +* main bulge chasing loop +* + DO 100 THGRID = 1, THGRNB + STT = (THGRID-1)*THGRSIZ+1 + THED = MIN( (STT + THGRSIZ -1), (N-1)) + DO 110 I = STT, N-1 + ED = MIN( I, THED ) + IF( STT.GT.ED ) GOTO 100 + DO 120 M = 1, STEPERCOL + ST = STT + DO 130 SWEEPID = ST, ED + DO 140 K = 1, GRSIZ + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + $ + (M-1)*GRSIZ + K + IF ( MYID.EQ.1 ) THEN + TTYPE = 1 + ELSE + TTYPE = MOD( MYID, 2 ) + 2 + ENDIF + + IF( TTYPE.EQ.2 ) THEN + COLPT = (MYID/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + BLKLASTIND = COLPT + ELSE + COLPT = ((MYID+1)/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + IF( ( STIND.GE.EDIND-1 ).AND. + $ ( EDIND.EQ.N ) ) THEN + BLKLASTIND = N + ELSE + BLKLASTIND = 0 + ENDIF + ENDIF +* +* Call the kernel +* +#if defined(_OPENMP) + IF( TTYPE.NE.1 ) THEN +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(in:WORK(MYID-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ELSE +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ENDIF +#else + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +#endif + IF ( BLKLASTIND.GE.(N-1) ) THEN + STT = STT + 1 + GOTO 130 + ENDIF + 140 CONTINUE + 130 CONTINUE + 120 CONTINUE + 110 CONTINUE + 100 CONTINUE +* +#if defined(_OPENMP) +!$OMP END MASTER +!$OMP END PARALLEL +#endif +* +* Copy the diagonal from A to D. Note that D is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + DO 150 I = 1, N + D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) + 150 CONTINUE +* +* Copy the off diagonal from A to E. Note that E is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + IF( UPPER ) THEN + DO 160 I = 1, N-1 + E( I ) = ( WORK( OFDPOS+I*LDA ) ) + 160 CONTINUE + ELSE + DO 170 I = 1, N-1 + E( I ) = ( WORK( OFDPOS+(I-1)*LDA ) ) + 170 CONTINUE + ENDIF +* + 200 CONTINUE +* + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of SSYTRD_SB2ST +* + END +#undef PRECISION_REAL + diff --git a/SRC/ssytrd_sy2sb.f b/SRC/ssytrd_sy2sb.f new file mode 100644 index 00000000..3dbbaf1f --- /dev/null +++ b/SRC/ssytrd_sy2sb.f @@ -0,0 +1,517 @@ +*> \brief \b SSYTRD_SY2SB +* +* @generated from zhetrd_he2hb.f, fortran z -> s, Sun Nov 6 19:34:06 2016 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SSYTRD_SY2SB + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), AB( LDAB, * ), +* TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric +*> band-diagonal form AB by a orthogonal similarity transformation: +*> Q**T * A * Q = AB. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the reduced matrix if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> The reduced matrix is stored in the array AB. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the diagonal and first superdiagonal +*> of A are overwritten by the corresponding elements of the +*> tridiagonal matrix T, and the elements above the first +*> superdiagonal, with the array TAU, represent the orthogonal +*> matrix Q as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and first subdiagonal of A are over- +*> written by the corresponding elements of the tridiagonal +*> matrix T, and the elements below the first subdiagonal, with +*> the array TAU, represent the orthogonal matrix Q as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB,N) +*> On exit, the upper or lower triangle of the symmetric band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (N-KD) +*> The scalar factors of the elementary reflectors (see Further +*> Details). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension LWORK. +*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK(1) returns the size of LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK which should be calculated +* by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD +*> where FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice otherwise +*> putting LWORK=-1 will provide the size of WORK. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup realSYcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +*> \verbatim +*> +*> If UPLO = 'U', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(k)**T . . . H(2)**T H(1)**T, where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**T +*> +*> where tau is a real scalar, and v is a real vector with +*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in +*> A(i,i+kd+1:n), and tau in TAU(i). +*> +*> If UPLO = 'L', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(1) H(2) . . . H(k), where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**T +*> +*> where tau is a real scalar, and v is a real vector with +*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in +* A(i+kd+2:n,i), and tau in TAU(i). +*> +*> The contents of A on exit are illustrated by the following examples +*> with n = 5: +*> +*> if UPLO = 'U': if UPLO = 'L': +*> +*> ( ab ab/v1 v1 v1 v1 ) ( ab ) +*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab ) +*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab ) +*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab ) +*> ( ab ) ( v1 v2 v3 ab/v4 ab ) +*> +*> where d and e denote diagonal and off-diagonal elements of T, and vi +*> denotes an element of the vector defining H(i). +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SSYTRD_SY2SB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. + REAL A( LDA, * ), AB( LDAB, * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL RONE + REAL ZERO, ONE, HALF + PARAMETER ( RONE = 1.0E+0, + $ ZERO = 0.0E+0, + $ ONE = 1.0E+0, + $ HALF = 0.5E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER + INTEGER I, J, IINFO, LWMIN, PN, PK, LK, + $ LDT, LDW, LDS2, LDS1, + $ LS2, LS1, LW, LT, + $ TPOS, WPOS, S2POS, S1POS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, SSYR2K, SSYMM, SGEMM, + $ SLARFT, SGELQF, SGEQRF, SLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required +* and test the input parameters +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) + LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( KD.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN + INFO = -7 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN + END IF +* +* Quick return if possible +* Copy the upper/lower portion of A into AB +* + IF( N.LE.KD+1 ) THEN + IF( UPPER ) THEN + DO 100 I = 1, N + LK = MIN( KD+1, I ) + CALL SCOPY( LK, A( I-LK+1, I ), 1, + $ AB( KD+1-LK+1, I ), 1 ) + 100 CONTINUE + ELSE + DO 110 I = 1, N + LK = MIN( KD+1, N-I+1 ) + CALL SCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 ) + 110 CONTINUE + ENDIF + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine the pointer position for the workspace +* + LDT = KD + LDS1 = KD + LT = LDT*KD + LW = N*KD + LS1 = LDS1*KD + LS2 = LWMIN - LT - LW - LS1 +* LS2 = N*MAX(KD,FACTOPTNB) + TPOS = 1 + WPOS = TPOS + LT + S1POS = WPOS + LW + S2POS = S1POS + LS1 + IF( UPPER ) THEN + LDW = KD + LDS2 = KD + ELSE + LDW = N + LDS2 = N + ENDIF +* +* +* Set the workspace of the triangular matrix T to zero once such a +* way everytime T is generated the upper/lower portion will be always zero +* + CALL SLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) +* + IF( UPPER ) THEN + DO 10 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the LQ factorization of the current block +* + CALL SGELQF( KD, PN, A( I, I+KD ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 20 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 20 CONTINUE +* + CALL SLASET( 'Lower', PK, PK, ZERO, ONE, + $ A( I, I+KD ), LDA ) +* +* Form the matrix T +* + CALL SLARFT( 'Forward', 'Rowwise', PN, PK, + $ A( I, I+KD ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL SGEMM( 'Conjugate', 'No transpose', PK, PN, PK, + $ ONE, WORK( TPOS ), LDT, + $ A( I, I+KD ), LDA, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL SSYMM( 'Right', UPLO, PK, PN, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL SGEMM( 'No transpose', 'Conjugate', PK, PK, PN, + $ ONE, WORK( WPOS ), LDW, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL SGEMM( 'No transpose', 'No transpose', PK, PN, PK, + $ -HALF, WORK( S1POS ), LDS1, + $ A( I, I+KD ), LDA, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V'*W - W'*V +* + CALL SSYR2K( UPLO, 'Conjugate', PN, PK, + $ -ONE, A( I, I+KD ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) + 10 CONTINUE +* +* Copy the upper band to AB which is the band storage matrix +* + DO 30 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL SCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 30 CONTINUE +* + ELSE +* +* Reduce the lower triangle of A to lower band matrix +* + DO 40 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the QR factorization of the current block +* + CALL SGEQRF( PN, KD, A( I+KD, I ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 50 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 50 CONTINUE +* + CALL SLASET( 'Upper', PK, PK, ZERO, ONE, + $ A( I+KD, I ), LDA ) +* +* Form the matrix T +* + CALL SLARFT( 'Forward', 'Columnwise', PN, PK, + $ A( I+KD, I ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ ONE, A( I+KD, I ), LDA, + $ WORK( TPOS ), LDT, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL SSYMM( 'Left', UPLO, PN, PK, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL SGEMM( 'Conjugate', 'No transpose', PK, PK, PN, + $ ONE, WORK( S2POS ), LDS2, + $ WORK( WPOS ), LDW, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL SGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ -HALF, A( I+KD, I ), LDA, + $ WORK( S1POS ), LDS1, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V*W' - W*V' +* + CALL SSYR2K( UPLO, 'No transpose', PN, PK, + $ -ONE, A( I+KD, I ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) +* ================================================================== +* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED +* DO 45 J = I, I+PK-1 +* LK = MIN( KD, N-J ) + 1 +* CALL SCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 ) +* 45 CONTINUE +* ================================================================== + 40 CONTINUE +* +* Copy the lower band to AB which is the band storage matrix +* + DO 60 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL SCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 60 CONTINUE + + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of SSYTRD_SY2SB +* + END diff --git a/SRC/zhb2st_kernels.f b/SRC/zhb2st_kernels.f new file mode 100644 index 00000000..ab03b303 --- /dev/null +++ b/SRC/zhb2st_kernels.f @@ -0,0 +1,320 @@ +*> \brief \b ZHB2ST_KERNELS +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHB2ST_KERNELS + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhb2st_kernels.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhb2st_kernels.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhb2st_kernels.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* ST, ED, SWEEP, N, NB, IB, +* A, LDA, V, TAU, LDVT, WORK) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* LOGICAL WANTZ +* INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), V( * ), +* TAU( * ), WORK( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHB2ST_KERNELS is an internal routine used by the ZHETRD_HB2ST +*> subroutine. +*> \endverbatim +* +* Arguments: +* ========== +* +*> @param[in] n +*> The order of the matrix A. +*> +*> @param[in] nb +*> The size of the band. +*> +*> @param[in, out] A +*> A pointer to the matrix A. +*> +*> @param[in] lda +*> The leading dimension of the matrix A. +*> +*> @param[out] V +*> COMPLEX*16 array, dimension 2*n if eigenvalues only are +*> requested or to be queried for vectors. +*> +*> @param[out] TAU +*> COMPLEX*16 array, dimension (2*n). +*> The scalar factors of the Householder reflectors are stored +*> in this array. +*> +*> @param[in] st +*> internal parameter for indices. +*> +*> @param[in] ed +*> internal parameter for indices. +*> +*> @param[in] sweep +*> internal parameter for indices. +*> +*> @param[in] Vblksiz +*> internal parameter for indices. +*> +*> @param[in] wantz +*> logical which indicate if Eigenvalue are requested or both +*> Eigenvalue/Eigenvectors. +*> +*> @param[in] work +*> Workspace of size nb. +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, + $ ST, ED, SWEEP, N, NB, IB, + $ A, LDA, V, TAU, LDVT, WORK) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + LOGICAL WANTZ + INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), V( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), + $ ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, + $ DPOS, OFDPOS, AJETER + COMPLEX*16 CTMP +* .. +* .. External Subroutines .. + EXTERNAL ZLARFG, ZLARFX, ZLARFY +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MOD +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. +* .. Executable Statements .. +* + AJETER = IB + LDVT + UPPER = LSAME( UPLO, 'U' ) + + IF( UPPER ) THEN + DPOS = 2 * NB + 1 + OFDPOS = 2 * NB + ELSE + DPOS = 1 + OFDPOS = 2 + ENDIF + +* +* Upper case +* + IF( UPPER ) THEN +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 101, 102, 103 ) TTYPE +* + 101 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 10 I = 1, LM-1 + V( VPOS+I ) = DCONJG( A( OFDPOS-I, ST+I ) ) + A( OFDPOS-I, ST+I ) = ZERO + 10 CONTINUE + CTMP = DCONJG( A( OFDPOS, ST ) ) + CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) + A( OFDPOS, ST ) = CTMP +* + 103 CONTINUE + LM = ED - ST + 1 + CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + GOTO 300 +* + 102 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 + IF( LM.GT.0) THEN + CALL ZLARFX( 'Left', LN, LM, V( VPOS ), + $ DCONJG( TAU( TAUPOS ) ), A( DPOS-NB, J1 ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 30 I = 1, LM-1 + V( VPOS+I ) = DCONJG( A( DPOS-NB-I, J1+I ) ) + A( DPOS-NB-I, J1+I ) = ZERO + 30 CONTINUE + CTMP = DCONJG( A( DPOS-NB, J1 ) ) + CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) + A( DPOS-NB, J1 ) = CTMP +* + CALL ZLARFX( 'Right', LN-1, LM, V( VPOS ), + $ TAU( TAUPOS ), + $ A( DPOS-NB+1, J1 ), LDA-1, WORK) + ENDIF + GOTO 300 +* +* Lower case +* + ELSE +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + ST + TAUPOS = MOD( SWEEP-1, 2 ) * N + ST + ENDIF + GO TO ( 201, 202, 203 ) TTYPE +* + 201 CONTINUE + LM = ED - ST + 1 +* + V( VPOS ) = ONE + DO 20 I = 1, LM-1 + V( VPOS+I ) = A( OFDPOS+I, ST-1 ) + A( OFDPOS+I, ST-1 ) = ZERO + 20 CONTINUE + CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + 203 CONTINUE + LM = ED - ST + 1 +* + CALL ZLARFY( UPLO, LM, V( VPOS ), 1, DCONJG( TAU( TAUPOS ) ), + $ A( DPOS, ST ), LDA-1, WORK) + + GOTO 300 +* + 202 CONTINUE + J1 = ED+1 + J2 = MIN( ED+NB, N ) + LN = ED-ST+1 + LM = J2-J1+1 +* + IF( LM.GT.0) THEN + CALL ZLARFX( 'Right', LM, LN, V( VPOS ), + $ TAU( TAUPOS ), A( DPOS+NB, ST ), + $ LDA-1, WORK) +* + IF( WANTZ ) THEN + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ELSE + VPOS = MOD( SWEEP-1, 2 ) * N + J1 + TAUPOS = MOD( SWEEP-1, 2 ) * N + J1 + ENDIF +* + V( VPOS ) = ONE + DO 40 I = 1, LM-1 + V( VPOS+I ) = A( DPOS+NB+I, ST ) + A( DPOS+NB+I, ST ) = ZERO + 40 CONTINUE + CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + $ TAU( TAUPOS ) ) +* + CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ), + $ DCONJG( TAU( TAUPOS ) ), + $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) + + ENDIF + GOTO 300 + ENDIF + + 300 CONTINUE + RETURN +* +* END OF ZHB2ST_KERNELS +* + END diff --git a/SRC/zhbev_2stage.f b/SRC/zhbev_2stage.f new file mode 100644 index 00000000..f1088b87 --- /dev/null +++ b/SRC/zhbev_2stage.f @@ -0,0 +1,386 @@ +*> \brief <b> ZHBEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHBEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHBEV_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX*16 array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension LWORK +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (max(1,3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16OTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHBEV_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, N, LWORK +* .. +* .. Array Arguments .. + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, WANTZ, LQUERY + INTEGER IINFO, IMAX, INDE, INDWRK, INDRWK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHB + EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR + $ ZHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -11 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHBEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + IF( LOWER ) THEN + W( 1 ) = DBLE( AB( 1, 1 ) ) + ELSE + W( 1 ) = DBLE( AB( KD+1, 1 ) ) + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = ONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDE = 1 + INDHOUS = 1 + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, call ZSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + INDRWK = INDE + N + CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of ZHBEV_2STAGE +* + END diff --git a/SRC/zhbevd_2stage.f b/SRC/zhbevd_2stage.f new file mode 100644 index 00000000..e4daae74 --- /dev/null +++ b/SRC/zhbevd_2stage.f @@ -0,0 +1,458 @@ +*> \brief <b> ZHBEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHBEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, +* WORK, LWORK, RWORK, LRWORK, IWORK, +* LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHBEVD_2STAGE computes all the eigenvalues and, optionally, eigenvectors of +*> a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it +*> uses a divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. If UPLO = 'U', the first +*> superdiagonal and the diagonal of the tridiagonal matrix T +*> are returned in rows KD and KD+1 of AB, and if UPLO = 'L', +*> the diagonal and first subdiagonal of T are returned in the +*> first two rows of AB. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX*16 array, dimension (LDZ, N) +*> If JOBZ = 'V', then if INFO = 0, Z contains the orthonormal +*> eigenvectors of the matrix A, with the i-th column of Z +*> holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, +*> dimension (LRWORK) +*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The dimension of array RWORK. +*> If N <= 1, LRWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LRWORK must be at least N. +*> If JOBZ = 'V' and N > 1, LRWORK must be at least +*> 1 + 5*N + 2*N**2. +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of array IWORK. +*> If JOBZ = 'N' or N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N . +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16OTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHBEVD_2STAGE( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, + $ WORK, LWORK, RWORK, LRWORK, IWORK, + $ LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, KD, LDAB, LDZ, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 AB( LDAB, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), + $ CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDWK2, INDRWK, ISCALE, + $ LLWORK, INDWK, LHTRD, LWTRD, IB, INDHOUS, + $ LIWMIN, LLRWK, LLWK2, LRWMIN, LWMIN + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHB + EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DSTERF, XERBLA, ZGEMM, ZLACPY, + $ ZLASCL, ZSTEDC, ZHETRD_HB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) +* + INFO = 0 + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LWMIN = 2*N**2 + LRWMIN = 1 + 5*N + 2*N**2 + LIWMIN = 3 + 5*N + ELSE + LWMIN = MAX( N, LHTRD + LWTRD ) + LRWMIN = N + LIWMIN = 1 + END IF + END IF + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KD.LT.0 ) THEN + INFO = -4 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -6 + ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -9 + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHBEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = DBLE( AB( 1, 1 ) ) + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + END IF +* +* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDE = 1 + INDRWK = INDE + N + LLRWK = LRWORK - INDRWK + 1 + INDHOUS = 1 + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + INDWK2 = INDWK + N*N + LLWK2 = LWORK - INDWK2 + 1 +* + CALL ZHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, call ZSTEDC. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK, N, WORK( INDWK2 ), + $ LLWK2, RWORK( INDRWK ), LLRWK, IWORK, LIWORK, + $ INFO ) + CALL ZGEMM( 'N', 'N', N, N, N, CONE, Z, LDZ, WORK, N, CZERO, + $ WORK( INDWK2 ), N ) + CALL ZLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN + RETURN +* +* End of ZHBEVD_2STAGE +* + END diff --git a/SRC/zhbevx_2stage.f b/SRC/zhbevx_2stage.f new file mode 100644 index 00000000..3efdcc74 --- /dev/null +++ b/SRC/zhbevx_2stage.f @@ -0,0 +1,646 @@ +*> \brief <b> ZHBEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for OTHER matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHBEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, +* Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, +* Z, LDZ, WORK, LWORK, RWORK, IWORK, +* IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 AB( LDAB, * ), Q( LDQ, * ), WORK( * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHBEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian band matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors +*> can be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found; +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found; +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB, N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> +*> On exit, AB is overwritten by values generated during the +*> reduction to tridiagonal form. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD + 1. +*> \endverbatim +*> +*> \param[out] Q +*> \verbatim +*> Q is COMPLEX*16 array, dimension (LDQ, N) +*> If JOBZ = 'V', the N-by-N unitary matrix used in the +*> reduction to tridiagonal form. +*> If JOBZ = 'N', the array Q is not referenced. +*> \endverbatim +*> +*> \param[in] LDQ +*> \verbatim +*> LDQ is INTEGER +*> The leading dimension of the array Q. If JOBZ = 'V', then +*> LDQ >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing AB to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*DLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*DLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (LWORK) +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the size of the band. +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (7*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complex16OTHEReigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, + $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, + $ Z, LDZ, WORK, LWORK, RWORK, IWORK, + $ IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, KD, LDAB, LDQ, LDZ, M, N, LWORK + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 AB( LDAB, * ), Q( LDQ, * ), WORK( * ), + $ Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), + $ CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, TEST, VALEIG, WANTZ, + $ LQUERY + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWK, INDRWK, INDWRK, ISCALE, ITMP1, + $ LLWORK, LWMIN, LHTRD, LWTRD, IB, INDHOUS, + $ J, JJ, NSPLIT + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU + COMPLEX*16 CTMP1 +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHB + EXTERNAL LSAME, DLAMCH, ZLANHB, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZCOPY, + $ ZGEMV, ZLACPY, ZLASCL, ZSTEIN, ZSTEQR, + $ ZSWAP, ZHETRD_HB2ST +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.KD+1 ) THEN + INFO = -7 + ELSE IF( WANTZ .AND. LDQ.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -11 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -12 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -13 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) + $ INFO = -18 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + IB = ILAENV( 18, 'ZHETRD_HB2ST', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_HB2ST', JOBZ, N, KD, IB, -1 ) + LWMIN = LHTRD + LWTRD + WORK( 1 ) = LWMIN + ENDIF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -20 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHBEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + M = 1 + IF( LOWER ) THEN + CTMP1 = AB( 1, 1 ) + ELSE + CTMP1 = AB( KD+1, 1 ) + END IF + TMP1 = DBLE( CTMP1 ) + IF( VALEIG ) THEN + IF( .NOT.( VL.LT.TMP1 .AND. VU.GE.TMP1 ) ) + $ M = 0 + END IF + IF( M.EQ.1 ) THEN + W( 1 ) = DBLE( CTMP1 ) + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + ELSE + VLL = ZERO + VUU = ZERO + END IF + ANRM = ZLANHB( 'M', UPLO, N, KD, AB, LDAB, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + CALL ZLASCL( 'B', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + ELSE + CALL ZLASCL( 'Q', KD, KD, ONE, SIGMA, N, N, AB, LDAB, INFO ) + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call ZHBTRD_HB2ST to reduce Hermitian band matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDRWK = INDE + N +* + INDHOUS = 1 + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL ZHETRD_HB2ST( 'N', JOBZ, UPLO, N, KD, AB, LDAB, + $ RWORK( INDD ), RWORK( INDE ), WORK( INDHOUS ), + $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal +* to zero, then call DSTERF or ZSTEQR. If this fails for some +* eigenvalue, then try DSTEBZ. +* + TEST = .FALSE. + IF (INDEIG) THEN + IF (IL.EQ.1 .AND. IU.EQ.N) THEN + TEST = .TRUE. + END IF + END IF + IF ((ALLEIG .OR. TEST) .AND. (ABSTOL.LE.ZERO)) THEN + CALL DCOPY( N, RWORK( INDD ), 1, W, 1 ) + INDEE = INDRWK + 2*N + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL DSTERF( N, W, RWORK( INDEE ), INFO ) + ELSE + CALL ZLACPY( 'A', N, N, Q, LDQ, Z, LDZ ) + CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 10 I = 1, N + IFAIL( I ) = 0 + 10 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWK = INDISP + N + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWK ), INFO ) +* + IF( WANTZ ) THEN + CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by ZSTEIN. +* + DO 20 J = 1, M + CALL ZCOPY( N, Z( 1, J ), 1, WORK( 1 ), 1 ) + CALL ZGEMV( 'N', N, N, CONE, Q, LDQ, WORK, 1, CZERO, + $ Z( 1, J ), 1 ) + 20 CONTINUE + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of ZHBEVX_2STAGE +* + END diff --git a/SRC/zheev_2stage.f b/SRC/zheev_2stage.f new file mode 100644 index 00000000..5aca4da2 --- /dev/null +++ b/SRC/zheev_2stage.f @@ -0,0 +1,355 @@ +*> \brief <b> ZHEEV_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHEEV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheev_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheev_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheev_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHEEV_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the algorithm failed to converge; i +*> off-diagonal elements of an intermediate tridiagonal +*> form did not converge to zero. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16HEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDTAU, INDWRK, ISCALE, + $ LLWORK, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DSTERF, XERBLA, ZLASCL, ZSTEQR, + $ ZUNGTR, ZHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -8 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHEEV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + W( 1 ) = DBLE( A( 1, 1 ) ) + WORK( 1 ) = 1 + IF( WANTZ ) + $ A( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDE = 1 + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, first call +* ZUNGTR to generate the unitary matrix, then call ZSTEQR. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL ZUNGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), + $ LLWORK, IINFO ) + INDWRK = INDE + N + CALL ZSTEQR( JOBZ, N, W, RWORK( INDE ), A, LDA, + $ RWORK( INDWRK ), INFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* Set WORK(1) to optimal complex workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of ZHEEV_2STAGE +* + END diff --git a/SRC/zheevd_2stage.f b/SRC/zheevd_2stage.f new file mode 100644 index 00000000..79a0e886 --- /dev/null +++ b/SRC/zheevd_2stage.f @@ -0,0 +1,451 @@ +*> \brief <b> ZHEEVD_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHEEVD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* RWORK, LRWORK, IWORK, LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ) +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a +*> complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. If eigenvectors are desired, it uses a +*> divide and conquer algorithm. +*> +*> The divide and conquer algorithm makes very mild assumptions about +*> floating point arithmetic. It will work on machines with a guard +*> digit in add/subtract, or on those binary machines without guard +*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or +*> Cray-2. It could conceivably fail on hexadecimal or decimal machines +*> without guard digits, but we know of none. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> orthonormal eigenvectors of the matrix A. +*> If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') +*> or the upper triangle (if UPLO='U') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N+1 +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N+1 +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2 +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, +*> dimension (LRWORK) +*> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The dimension of the array RWORK. +*> If N <= 1, LRWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LRWORK must be at least N. +*> If JOBZ = 'V' and N > 1, LRWORK must be at least +*> 1 + 5*N + 2*N**2. +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK must be at least 1. +*> If JOBZ = 'N' and N > 1, LIWORK must be at least 1. +*> If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N. +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i and JOBZ = 'N', then the algorithm failed +*> to converge; i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> if INFO = i and JOBZ = 'V', then the algorithm failed +*> to compute an eigenvalue while working on the submatrix +*> lying in rows and columns INFO/(N+1) through +*> mod(INFO,N+1). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16HEeigen +* +*> \par Further Details: +* ===================== +*> +*> Modified description of INFO. Sven, 16 Feb 05. +* +*> \par Contributors: +* ================== +*> +*> Jeff Rutter, Computer Science Division, University of California +*> at Berkeley, USA +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHEEVD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + $ RWORK, LRWORK, IWORK, LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, LDA, LIWORK, LRWORK, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LOWER, LQUERY, WANTZ + INTEGER IINFO, IMAX, INDE, INDRWK, INDTAU, INDWK2, + $ INDWRK, ISCALE, LIWMIN, LLRWK, LLWORK, + $ LLWRK2, LRWMIN, LWMIN, + $ LHTRD, LWTRD, KD, IB, INDHOUS + + + DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, + $ SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DSTERF, XERBLA, ZLACPY, ZLASCL, + $ ZSTEDC, ZUNMTR, ZHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + LOWER = LSAME( UPLO, 'L' ) + LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + IF( WANTZ ) THEN + LWMIN = 2*N + N*N + LRWMIN = 1 + 5*N + 2*N**2 + LIWMIN = 3 + 5*N + ELSE + LWMIN = N + 1 + LHTRD + LWTRD + LRWMIN = N + LIWMIN = 1 + END IF + END IF + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -8 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHEEVD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( N.EQ.1 ) THEN + W( 1 ) = DBLE( A( 1, 1 ) ) + IF( WANTZ ) + $ A( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = SQRT( BIGNUM ) +* +* Scale matrix to allowable range, if necessary. +* + ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK ) + ISCALE = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) + $ CALL ZLASCL( UPLO, 0, 0, ONE, SIGMA, N, N, A, LDA, INFO ) +* +* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDE = 1 + INDRWK = INDE + N + LLRWK = LRWORK - INDRWK + 1 + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 + INDWK2 = INDWRK + N*N + LLWRK2 = LWORK - INDWK2 + 1 +* + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDWRK ), LLWORK, IINFO ) +* +* For eigenvalues only, call DSTERF. For eigenvectors, first call +* ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the +* tridiagonal matrix, then call ZUNMTR to multiply it to the +* Householder transformations represented as Householder vectors in +* A. +* + IF( .NOT.WANTZ ) THEN + CALL DSTERF( N, W, RWORK( INDE ), INFO ) + ELSE + CALL ZSTEDC( 'I', N, W, RWORK( INDE ), WORK( INDWRK ), N, + $ WORK( INDWK2 ), LLWRK2, RWORK( INDRWK ), LLRWK, + $ IWORK, LIWORK, INFO ) + CALL ZUNMTR( 'L', UPLO, 'N', N, N, A, LDA, WORK( INDTAU ), + $ WORK( INDWRK ), N, WORK( INDWK2 ), LLWRK2, IINFO ) + CALL ZLACPY( 'A', N, N, WORK( INDWRK ), N, A, LDA ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = N + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of ZHEEVD_2STAGE +* + END diff --git a/SRC/zheevr_2stage.f b/SRC/zheevr_2stage.f new file mode 100644 index 00000000..bfd43056 --- /dev/null +++ b/SRC/zheevr_2stage.f @@ -0,0 +1,779 @@ +*> \brief <b> ZHEEVR_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHEEVR_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevr_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevr_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevr_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, +* WORK, LWORK, RWORK, LRWORK, IWORK, +* LIWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK, +* $ M, N +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER ISUPPZ( * ), IWORK( * ) +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHEEVR_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> +*> ZHEEVR_2STAGE first reduces the matrix A to tridiagonal form T with a call +*> to ZHETRD. Then, whenever possible, ZHEEVR_2STAGE calls ZSTEMR to compute +*> eigenspectrum using Relatively Robust Representations. ZSTEMR +*> computes eigenvalues by the dqds algorithm, while orthogonal +*> eigenvectors are computed from various "good" L D L^T representations +*> (also known as Relatively Robust Representations). Gram-Schmidt +*> orthogonalization is avoided as far as possible. More specifically, +*> the various steps of the algorithm are as follows. +*> +*> For each unreduced block (submatrix) of T, +*> (a) Compute T - sigma I = L D L^T, so that L and D +*> define all the wanted eigenvalues to high relative accuracy. +*> This means that small relative changes in the entries of D and L +*> cause only small relative changes in the eigenvalues and +*> eigenvectors. The standard (unfactored) representation of the +*> tridiagonal matrix T does not have this property in general. +*> (b) Compute the eigenvalues to suitable accuracy. +*> If the eigenvectors are desired, the algorithm attains full +*> accuracy of the computed eigenvalues only right before +*> the corresponding vectors have to be computed, see steps c) and d). +*> (c) For each cluster of close eigenvalues, select a new +*> shift close to the cluster, find a new factorization, and refine +*> the shifted eigenvalues to suitable accuracy. +*> (d) For each eigenvalue with a large enough relative separation compute +*> the corresponding eigenvector by forming a rank revealing twisted +*> factorization. Go back to (c) for any clusters that remain. +*> +*> The desired accuracy of the output can be specified by the input +*> parameter ABSTOL. +*> +*> For more details, see DSTEMR's documentation and: +*> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations +*> to compute orthogonal eigenvectors of symmetric tridiagonal matrices," +*> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004. +*> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and +*> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25, +*> 2004. Also LAPACK Working Note 154. +*> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric +*> tridiagonal eigenvalue/eigenvector problem", +*> Computer Science Division Technical Report No. UCB/CSD-97-971, +*> UC Berkeley, May 1997. +*> +*> +*> Note 1 : ZHEEVR_2STAGE calls ZSTEMR when the full spectrum is requested +*> on machines which conform to the ieee-754 floating point standard. +*> ZHEEVR_2STAGE calls DSTEBZ and ZSTEIN on non-ieee machines and +*> when partial spectrum requests are made. +*> +*> Normal execution of ZSTEMR may create NaNs and infinities and +*> hence may abort due to a floating point exception in environments +*> which do not handle NaNs and infinities in the ieee standard default +*> manner. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> For RANGE = 'V' or 'I' and IU - IL < N - 1, DSTEBZ and +*> ZSTEIN are called +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> +*> If high relative accuracy is important, set ABSTOL to +*> DLAMCH( 'Safe minimum' ). Doing so will guarantee that +*> eigenvalues are computed to high relative accuracy when +*> possible in future releases. The current code does not +*> make any guarantees about high relative accuracy, but +*> furutre releases will. See J. Barlow and J. Demmel, +*> "Computing Accurate Eigensystems of Scaled Diagonally +*> Dominant Matrices", LAPACK Working Note #7, for a discussion +*> of which matrices define their eigenvalues to high relative +*> accuracy. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> The first M elements contain the selected eigenvalues in +*> ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] ISUPPZ +*> \verbatim +*> ISUPPZ is INTEGER array, dimension ( 2*max(1,M) ) +*> The support of the eigenvectors in Z, i.e., the indices +*> indicating the nonzero elements in Z. The i-th eigenvector +*> is nonzero only in elements ISUPPZ( 2*i-1 ) through +*> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal +*> matrix). The support of the eigenvectors of A is typically +*> 1:N because of the unitary transformations applied by ZUNMTR. +*> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 26*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal sizes of the WORK, RWORK and +*> IWORK arrays, returns these values as the first entries of +*> the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) +*> On exit, if INFO = 0, RWORK(1) returns the optimal +*> (and minimal) LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> +*> If LRWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal +*> (and minimal) LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> +*> If LIWORK = -1, then a workspace query is assumed; the +*> routine only calculates the optimal sizes of the WORK, RWORK +*> and IWORK arrays, returns these values as the first entries +*> of the WORK, RWORK and IWORK arrays, and no error message +*> related to LWORK or LRWORK or LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: Internal error +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complex16HEeigen +* +*> \par Contributors: +* ================== +*> +*> Inderjit Dhillon, IBM Almaden, USA \n +*> Osni Marques, LBNL/NERSC, USA \n +*> Ken Stanley, Computer Science Division, University of +*> California at Berkeley, USA \n +*> Jason Riedy, Computer Science Division, University of +*> California at Berkeley, USA \n +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHEEVR_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, ISUPPZ, + $ WORK, LWORK, RWORK, LRWORK, IWORK, + $ LIWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LIWORK, LRWORK, LWORK, + $ M, N + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER ISUPPZ( * ), IWORK( * ) + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ, TRYRAC + CHARACTER ORDER + INTEGER I, IEEEOK, IINFO, IMAX, INDIBL, INDIFL, INDISP, + $ INDIWO, INDRD, INDRDD, INDRE, INDREE, INDRWK, + $ INDTAU, INDWK, INDWKN, ISCALE, ITMP1, J, JJ, + $ LIWMIN, LLWORK, LLRWORK, LLWRKN, LRWMIN, + $ LWMIN, NSPLIT, LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANSY + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL, + $ ZHETRD_2STAGE, ZSTEMR, ZSTEIN, ZSWAP, ZUNMTR +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IEEEOK = ILAENV( 10, 'ZHEEVR', 'N', 1, 2, 3, 4 ) +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) +* + LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. + $ ( LIWORK.EQ.-1 ) ) +* + KD = ILAENV( 17, 'DSYTRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + LRWMIN = MAX( 1, 24*N ) + LIWMIN = MAX( 1, 10*N ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -18 + ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN + INFO = -20 + ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN + INFO = -22 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHEEVR_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* + IF( N.EQ.1 ) THEN + WORK( 1 ) = 2 + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = DBLE( A( 1, 1 ) ) + ELSE + IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) ) + $ THEN + M = 1 + W( 1 ) = DBLE( A( 1, 1 ) ) + END IF + END IF + IF( WANTZ ) THEN + Z( 1, 1 ) = ONE + ISUPPZ( 1 ) = 1 + ISUPPZ( 2 ) = 1 + END IF + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF (VALEIG) THEN + VLL = VL + VUU = VU + END IF + ANRM = ZLANSY( 'M', UPLO, N, A, LDA, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF + +* Initialize indices into workspaces. Note: The IWORK indices are +* used only if DSTERF or ZSTEMR fail. + +* WORK(INDTAU:INDTAU+N-1) stores the complex scalar factors of the +* elementary reflectors used in ZHETRD. + INDTAU = 1 +* INDWK is the starting offset of the remaining complex workspace, +* and LLWORK is the remaining complex workspace size. + INDHOUS = INDTAU + N + INDWK = INDHOUS + LHTRD + LLWORK = LWORK - INDWK + 1 + +* RWORK(INDRD:INDRD+N-1) stores the real tridiagonal's diagonal +* entries. + INDRD = 1 +* RWORK(INDRE:INDRE+N-1) stores the off-diagonal entries of the +* tridiagonal matrix from ZHETRD. + INDRE = INDRD + N +* RWORK(INDRDD:INDRDD+N-1) is a copy of the diagonal entries over +* -written by ZSTEMR (the DSTERF path copies the diagonal to W). + INDRDD = INDRE + N +* RWORK(INDREE:INDREE+N-1) is a copy of the off-diagonal entries over +* -written while computing the eigenvalues in DSTERF and ZSTEMR. + INDREE = INDRDD + N +* INDRWK is the starting offset of the left-over real workspace, and +* LLRWORK is the remaining workspace size. + INDRWK = INDREE + N + LLRWORK = LRWORK - INDRWK + 1 + +* IWORK(INDIBL:INDIBL+M-1) corresponds to IBLOCK in DSTEBZ and +* stores the block indices of each of the M<=N eigenvalues. + INDIBL = 1 +* IWORK(INDISP:INDISP+NSPLIT-1) corresponds to ISPLIT in DSTEBZ and +* stores the starting and finishing indices of each block. + INDISP = INDIBL + N +* IWORK(INDIFL:INDIFL+N-1) stores the indices of eigenvectors +* that corresponding to eigenvectors that fail to converge in +* ZSTEIN. This information is discarded; if any fail, the driver +* returns INFO > 0. + INDIFL = INDISP + N +* INDIWO is the offset of the remaining integer workspace. + INDIWO = INDIFL + N + +* +* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + $ RWORK( INDRE ), WORK( INDTAU ), + $ WORK( INDHOUS ), LHTRD, + $ WORK( INDWK ), LLWORK, IINFO ) +* +* If all eigenvalues are desired +* then call DSTERF or ZSTEMR and ZUNMTR. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG.OR.TEST ) .AND. ( IEEEOK.EQ.1 ) ) THEN + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N, RWORK( INDRD ), 1, W, 1 ) + CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) + CALL DSTERF( N, W, RWORK( INDREE ), INFO ) + ELSE + CALL DCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) + CALL DCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) +* + IF (ABSTOL .LE. TWO*N*EPS) THEN + TRYRAC = .TRUE. + ELSE + TRYRAC = .FALSE. + END IF + CALL ZSTEMR( JOBZ, 'A', N, RWORK( INDRDD ), + $ RWORK( INDREE ), VL, VU, IL, IU, M, W, + $ Z, LDZ, N, ISUPPZ, TRYRAC, + $ RWORK( INDRWK ), LLRWORK, + $ IWORK, LIWORK, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by ZSTEMR. +* + IF( WANTZ .AND. INFO.EQ.0 ) THEN + INDWKN = INDWK + LLWRKN = LWORK - INDWKN + 1 + CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, + $ WORK( INDTAU ), Z, LDZ, WORK( INDWKN ), + $ LLWRKN, IINFO ) + END IF + END IF +* +* + IF( INFO.EQ.0 ) THEN + M = N + GO TO 30 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. +* Also call DSTEBZ and ZSTEIN if ZSTEMR fails. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDRD ), RWORK( INDRE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWO ), INFO ) +* + IF( WANTZ ) THEN + CALL ZSTEIN( N, RWORK( INDRD ), RWORK( INDRE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWO ), IWORK( INDIFL ), + $ INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by ZSTEIN. +* + INDWKN = INDWK + LLWRKN = LWORK - INDWKN + 1 + CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWKN ), LLWRKN, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 30 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 50 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 40 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 40 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + END IF + 50 CONTINUE + END IF +* +* Set WORK(1) to optimal workspace size. +* + WORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN + IWORK( 1 ) = LIWMIN +* + RETURN +* +* End of ZHEEVR_2STAGE +* + END diff --git a/SRC/zheevx_2stage.f b/SRC/zheevx_2stage.f new file mode 100644 index 00000000..e33d55e0 --- /dev/null +++ b/SRC/zheevx_2stage.f @@ -0,0 +1,618 @@ +*> \brief <b> ZHEEVX_2STAGE computes the eigenvalues and, optionally, the left and/or right eigenvectors for HE matrices</b> +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHEEVX_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zheevx_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zheevx_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zheevx_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, +* IL, IU, ABSTOL, M, W, Z, LDZ, WORK, +* LWORK, RWORK, IWORK, IFAIL, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, RANGE, UPLO +* INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N +* DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. +* INTEGER IFAIL( * ), IWORK( * ) +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHEEVX_2STAGE computes selected eigenvalues and, optionally, eigenvectors +*> of a complex Hermitian matrix A using the 2stage technique for +*> the reduction to tridiagonal. Eigenvalues and eigenvectors can +*> be selected by specifying either a range of values or a range of +*> indices for the desired eigenvalues. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] RANGE +*> \verbatim +*> RANGE is CHARACTER*1 +*> = 'A': all eigenvalues will be found. +*> = 'V': all eigenvalues in the half-open interval (VL,VU] +*> will be found. +*> = 'I': the IL-th through IU-th eigenvalues will be found. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> On exit, the lower triangle (if UPLO='L') or the upper +*> triangle (if UPLO='U') of A, including the diagonal, is +*> destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in] VL +*> \verbatim +*> VL is DOUBLE PRECISION +*> If RANGE='V', the lower bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] VU +*> \verbatim +*> VU is DOUBLE PRECISION +*> If RANGE='V', the upper bound of the interval to +*> be searched for eigenvalues. VL < VU. +*> Not referenced if RANGE = 'A' or 'I'. +*> \endverbatim +*> +*> \param[in] IL +*> \verbatim +*> IL is INTEGER +*> If RANGE='I', the index of the +*> smallest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] IU +*> \verbatim +*> IU is INTEGER +*> If RANGE='I', the index of the +*> largest eigenvalue to be returned. +*> 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. +*> Not referenced if RANGE = 'A' or 'V'. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> The absolute error tolerance for the eigenvalues. +*> An approximate eigenvalue is accepted as converged +*> when it is determined to lie in an interval [a,b] +*> of width less than or equal to +*> +*> ABSTOL + EPS * max( |a|,|b| ) , +*> +*> where EPS is the machine precision. If ABSTOL is less than +*> or equal to zero, then EPS*|T| will be used in its place, +*> where |T| is the 1-norm of the tridiagonal matrix obtained +*> by reducing A to tridiagonal form. +*> +*> Eigenvalues will be computed most accurately when ABSTOL is +*> set to twice the underflow threshold 2*DLAMCH('S'), not zero. +*> If this routine returns with INFO>0, indicating that some +*> eigenvectors did not converge, try setting ABSTOL to +*> 2*DLAMCH('S'). +*> +*> See "Computing Small Singular Values of Bidiagonal Matrices +*> with Guaranteed High Relative Accuracy," by Demmel and +*> Kahan, LAPACK Working Note #3. +*> \endverbatim +*> +*> \param[out] M +*> \verbatim +*> M is INTEGER +*> The total number of eigenvalues found. 0 <= M <= N. +*> If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> On normal exit, the first M elements contain the selected +*> eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] Z +*> \verbatim +*> Z is COMPLEX*16 array, dimension (LDZ, max(1,M)) +*> If JOBZ = 'V', then if INFO = 0, the first M columns of Z +*> contain the orthonormal eigenvectors of the matrix A +*> corresponding to the selected eigenvalues, with the i-th +*> column of Z holding the eigenvector associated with W(i). +*> If an eigenvector fails to converge, then that column of Z +*> contains the latest approximation to the eigenvector, and the +*> index of the eigenvector is returned in IFAIL. +*> If JOBZ = 'N', then Z is not referenced. +*> Note: the user must ensure that at least max(1,M) columns are +*> supplied in the array Z; if RANGE = 'V', the exact value of M +*> is not known in advance and an upper bound must be used. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is INTEGER +*> The leading dimension of the array Z. LDZ >= 1, and if +*> JOBZ = 'V', LDZ >= max(1,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, 8*N, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (7*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (5*N) +*> \endverbatim +*> +*> \param[out] IFAIL +*> \verbatim +*> IFAIL is INTEGER array, dimension (N) +*> If JOBZ = 'V', then if INFO = 0, the first M elements of +*> IFAIL are zero. If INFO > 0, then IFAIL contains the +*> indices of the eigenvectors that failed to converge. +*> If JOBZ = 'N', then IFAIL is not referenced. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, then i eigenvectors failed to converge. +*> Their indices are stored in array IFAIL. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date June 2016 +* +*> \ingroup complex16HEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHEEVX_2STAGE( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, + $ IL, IU, ABSTOL, M, W, Z, LDZ, WORK, + $ LWORK, RWORK, IWORK, IFAIL, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.1) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, RANGE, UPLO + INTEGER IL, INFO, IU, LDA, LDZ, LWORK, M, N + DOUBLE PRECISION ABSTOL, VL, VU +* .. +* .. Array Arguments .. + INTEGER IFAIL( * ), IWORK( * ) + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 A( LDA, * ), WORK( * ), Z( LDZ, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, TEST, VALEIG, + $ WANTZ + CHARACTER ORDER + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, + $ ITMP1, J, JJ, LLWORK, + $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS + DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, + $ SIGMA, SMLNUM, TMP1, VLL, VUU +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANHE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANHE +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DSCAL, DSTEBZ, DSTERF, XERBLA, ZDSCAL, + $ ZLACPY, ZSTEIN, ZSTEQR, ZSWAP, ZUNGTR, ZUNMTR, + $ ZHETRD_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LOWER = LSAME( UPLO, 'L' ) + WANTZ = LSAME( JOBZ, 'V' ) + ALLEIG = LSAME( RANGE, 'A' ) + VALEIG = LSAME( RANGE, 'V' ) + INDEIG = LSAME( RANGE, 'I' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -1 + ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN + INFO = -2 + ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE + IF( VALEIG ) THEN + IF( N.GT.0 .AND. VU.LE.VL ) + $ INFO = -8 + ELSE IF( INDEIG ) THEN + IF( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) THEN + INFO = -10 + END IF + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN + INFO = -15 + END IF + END IF +* + IF( INFO.EQ.0 ) THEN + IF( N.LE.1 ) THEN + LWMIN = 1 + WORK( 1 ) = LWMIN + ELSE + KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN + END IF +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) + $ INFO = -17 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHEEVX_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + M = 0 + IF( N.EQ.0 ) THEN + RETURN + END IF +* + IF( N.EQ.1 ) THEN + IF( ALLEIG .OR. INDEIG ) THEN + M = 1 + W( 1 ) = DBLE( A( 1, 1 ) ) + ELSE IF( VALEIG ) THEN + IF( VL.LT.DBLE( A( 1, 1 ) ) .AND. VU.GE.DBLE( A( 1, 1 ) ) ) + $ THEN + M = 1 + W( 1 ) = DBLE( A( 1, 1 ) ) + END IF + END IF + IF( WANTZ ) + $ Z( 1, 1 ) = CONE + RETURN + END IF +* +* Get machine constants. +* + SAFMIN = DLAMCH( 'Safe minimum' ) + EPS = DLAMCH( 'Precision' ) + SMLNUM = SAFMIN / EPS + BIGNUM = ONE / SMLNUM + RMIN = SQRT( SMLNUM ) + RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) +* +* Scale matrix to allowable range, if necessary. +* + ISCALE = 0 + ABSTLL = ABSTOL + IF( VALEIG ) THEN + VLL = VL + VUU = VU + END IF + ANRM = ZLANHE( 'M', UPLO, N, A, LDA, RWORK ) + IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN + ISCALE = 1 + SIGMA = RMIN / ANRM + ELSE IF( ANRM.GT.RMAX ) THEN + ISCALE = 1 + SIGMA = RMAX / ANRM + END IF + IF( ISCALE.EQ.1 ) THEN + IF( LOWER ) THEN + DO 10 J = 1, N + CALL ZDSCAL( N-J+1, SIGMA, A( J, J ), 1 ) + 10 CONTINUE + ELSE + DO 20 J = 1, N + CALL ZDSCAL( J, SIGMA, A( 1, J ), 1 ) + 20 CONTINUE + END IF + IF( ABSTOL.GT.0 ) + $ ABSTLL = ABSTOL*SIGMA + IF( VALEIG ) THEN + VLL = VL*SIGMA + VUU = VU*SIGMA + END IF + END IF +* +* Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. +* + INDD = 1 + INDE = INDD + N + INDRWK = INDE + N + INDTAU = 1 + INDHOUS = INDTAU + N + INDWRK = INDHOUS + LHTRD + LLWORK = LWORK - INDWRK + 1 +* + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ), + $ RWORK( INDE ), WORK( INDTAU ), + $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ), + $ LLWORK, IINFO ) +* +* If all eigenvalues are desired and ABSTOL is less than or equal to +* zero, then call DSTERF or ZUNGTR and ZSTEQR. If this fails for +* some eigenvalue, then try DSTEBZ. +* + TEST = .FALSE. + IF( INDEIG ) THEN + IF( IL.EQ.1 .AND. IU.EQ.N ) THEN + TEST = .TRUE. + END IF + END IF + IF( ( ALLEIG .OR. TEST ) .AND. ( ABSTOL.LE.ZERO ) ) THEN + CALL DCOPY( N, RWORK( INDD ), 1, W, 1 ) + INDEE = INDRWK + 2*N + IF( .NOT.WANTZ ) THEN + CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL DSTERF( N, W, RWORK( INDEE ), INFO ) + ELSE + CALL ZLACPY( 'A', N, N, A, LDA, Z, LDZ ) + CALL ZUNGTR( UPLO, N, Z, LDZ, WORK( INDTAU ), + $ WORK( INDWRK ), LLWORK, IINFO ) + CALL DCOPY( N-1, RWORK( INDE ), 1, RWORK( INDEE ), 1 ) + CALL ZSTEQR( JOBZ, N, W, RWORK( INDEE ), Z, LDZ, + $ RWORK( INDRWK ), INFO ) + IF( INFO.EQ.0 ) THEN + DO 30 I = 1, N + IFAIL( I ) = 0 + 30 CONTINUE + END IF + END IF + IF( INFO.EQ.0 ) THEN + M = N + GO TO 40 + END IF + INFO = 0 + END IF +* +* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. +* + IF( WANTZ ) THEN + ORDER = 'B' + ELSE + ORDER = 'E' + END IF + INDIBL = 1 + INDISP = INDIBL + N + INDIWK = INDISP + N + CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, + $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, + $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( INDIWK ), INFO ) +* + IF( WANTZ ) THEN + CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, + $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) +* +* Apply unitary matrix used in reduction to tridiagonal +* form to eigenvectors returned by ZSTEIN. +* + CALL ZUNMTR( 'L', UPLO, 'N', N, M, A, LDA, WORK( INDTAU ), Z, + $ LDZ, WORK( INDWRK ), LLWORK, IINFO ) + END IF +* +* If matrix was scaled, then rescale eigenvalues appropriately. +* + 40 CONTINUE + IF( ISCALE.EQ.1 ) THEN + IF( INFO.EQ.0 ) THEN + IMAX = M + ELSE + IMAX = INFO - 1 + END IF + CALL DSCAL( IMAX, ONE / SIGMA, W, 1 ) + END IF +* +* If eigenvalues are not in order, then sort them, along with +* eigenvectors. +* + IF( WANTZ ) THEN + DO 60 J = 1, M - 1 + I = 0 + TMP1 = W( J ) + DO 50 JJ = J + 1, M + IF( W( JJ ).LT.TMP1 ) THEN + I = JJ + TMP1 = W( JJ ) + END IF + 50 CONTINUE +* + IF( I.NE.0 ) THEN + ITMP1 = IWORK( INDIBL+I-1 ) + W( I ) = W( J ) + IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + W( J ) = TMP1 + IWORK( INDIBL+J-1 ) = ITMP1 + CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) + IF( INFO.NE.0 ) THEN + ITMP1 = IFAIL( I ) + IFAIL( I ) = IFAIL( J ) + IFAIL( J ) = ITMP1 + END IF + END IF + 60 CONTINUE + END IF +* +* Set WORK(1) to optimal complex workspace size. +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of ZHEEVX_2STAGE +* + END diff --git a/SRC/zhegv_2stage.f b/SRC/zhegv_2stage.f new file mode 100644 index 00000000..5079d240 --- /dev/null +++ b/SRC/zhegv_2stage.f @@ -0,0 +1,379 @@ +*> \brief \b ZHEGV_2STAGE +* +* @precisions fortran z -> c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHEGV_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhegv_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhegv_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhegv_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, +* WORK, LWORK, RWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER JOBZ, UPLO +* INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION RWORK( * ), W( * ) +* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHEGV_2STAGE computes all the eigenvalues, and optionally, the eigenvectors +*> of a complex generalized Hermitian-definite eigenproblem, of the form +*> A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. +*> Here A and B are assumed to be Hermitian and B is also +*> positive definite. +*> This routine use the 2stage technique for the reduction to tridiagonal +*> which showed higher performance on recent architecture and for large +* sizes N>2000. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] ITYPE +*> \verbatim +*> ITYPE is INTEGER +*> Specifies the problem type to be solved: +*> = 1: A*x = (lambda)*B*x +*> = 2: A*B*x = (lambda)*x +*> = 3: B*A*x = (lambda)*x +*> \endverbatim +*> +*> \param[in] JOBZ +*> \verbatim +*> JOBZ is CHARACTER*1 +*> = 'N': Compute eigenvalues only; +*> = 'V': Compute eigenvalues and eigenvectors. +*> Not available in this release. +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangles of A and B are stored; +*> = 'L': Lower triangles of A and B are stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrices A and B. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA, N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the +*> leading N-by-N upper triangular part of A contains the +*> upper triangular part of the matrix A. If UPLO = 'L', +*> the leading N-by-N lower triangular part of A contains +*> the lower triangular part of the matrix A. +*> +*> On exit, if JOBZ = 'V', then if INFO = 0, A contains the +*> matrix Z of eigenvectors. The eigenvectors are normalized +*> as follows: +*> if ITYPE = 1 or 2, Z**H*B*Z = I; +*> if ITYPE = 3, Z**H*inv(B)*Z = I. +*> If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') +*> or the lower triangle (if UPLO='L') of A, including the +*> diagonal, is destroyed. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB, N) +*> On entry, the Hermitian positive definite matrix B. +*> If UPLO = 'U', the leading N-by-N upper triangular part of B +*> contains the upper triangular part of the matrix B. +*> If UPLO = 'L', the leading N-by-N lower triangular part of B +*> contains the lower triangular part of the matrix B. +*> +*> On exit, if INFO <= N, the part of B containing the matrix is +*> overwritten by the triangular factor U or L from the Cholesky +*> factorization B = U**H*U or B = L*L**H. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] W +*> \verbatim +*> W is DOUBLE PRECISION array, dimension (N) +*> If INFO = 0, the eigenvalues in ascending order. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The length of the array WORK. LWORK >= 1, when N <= 1; +*> otherwise +*> If JOBZ = 'N' and N > 1, LWORK must be queried. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N + N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N + N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> If JOBZ = 'V' and N > 1, LWORK must be queried. Not yet available +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (max(1, 3*N-2)) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: ZPOTRF or ZHEEV returned an error code: +*> <= N: if INFO = i, ZHEEV failed to converge; +*> i off-diagonal elements of an intermediate +*> tridiagonal form did not converge to zero; +*> > N: if INFO = N + i, for 1 <= i <= N, then the leading +*> minor of order i of B is not positive definite. +*> The factorization of B could not be completed and +*> no eigenvalues or eigenvectors were computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16HEeigen +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> All details about the 2stage techniques are available in: +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZHEGV_2STAGE( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, + $ WORK, LWORK, RWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK driver routine (version 3.6.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER JOBZ, UPLO + INTEGER INFO, ITYPE, LDA, LDB, LWORK, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION RWORK( * ), W( * ) + COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTZ + CHARACTER TRANS + INTEGER NEIG, LWMIN, LHTRD, LWTRD, KD, IB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZHEGST, ZPOTRF, ZTRMM, ZTRSM, + $ ZHEEV_2STAGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + WANTZ = LSAME( JOBZ, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) +* + INFO = 0 + IF( ITYPE.LT.1 .OR. ITYPE.GT.3 ) THEN + INFO = -1 + ELSE IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN + INFO = -2 + ELSE IF( .NOT.( UPPER .OR. LSAME( UPLO, 'L' ) ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF +* + IF( INFO.EQ.0 ) THEN + KD = ILAENV( 17, 'ZHETRD_2STAGE', JOBZ, N, -1, -1, -1 ) + IB = ILAENV( 18, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) + LHTRD = ILAENV( 19, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWTRD = ILAENV( 20, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) + LWMIN = N + LHTRD + LWTRD + WORK( 1 ) = LWMIN +* + IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHEGV_2STAGE ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form a Cholesky factorization of B. +* + CALL ZPOTRF( UPLO, N, B, LDB, INFO ) + IF( INFO.NE.0 ) THEN + INFO = N + INFO + RETURN + END IF +* +* Transform problem to standard eigenvalue problem and solve. +* + CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) + CALL ZHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, + $ WORK, LWORK, RWORK, INFO ) +* + IF( WANTZ ) THEN +* +* Backtransform eigenvectors to the original problem. +* + NEIG = N + IF( INFO.GT.0 ) + $ NEIG = INFO - 1 + IF( ITYPE.EQ.1 .OR. ITYPE.EQ.2 ) THEN +* +* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; +* backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y +* + IF( UPPER ) THEN + TRANS = 'N' + ELSE + TRANS = 'C' + END IF +* + CALL ZTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) +* + ELSE IF( ITYPE.EQ.3 ) THEN +* +* For B*A*x=(lambda)*x; +* backtransform eigenvectors: x = L*y or U**H *y +* + IF( UPPER ) THEN + TRANS = 'C' + ELSE + TRANS = 'N' + END IF +* + CALL ZTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, + $ B, LDB, A, LDA ) + END IF + END IF +* + WORK( 1 ) = LWMIN +* + RETURN +* +* End of ZHEGV_2STAGE +* + END diff --git a/SRC/zhetrd_2stage.f b/SRC/zhetrd_2stage.f new file mode 100644 index 00000000..62fd7539 --- /dev/null +++ b/SRC/zhetrd_2stage.f @@ -0,0 +1,337 @@ +*> \brief \b ZHETRD_2STAGE +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHETRD_2STAGE + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* HOUS2, LHOUS2, WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER VECT, UPLO +* INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. +* DOUBLE PRECISION D( * ), E( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), +* HOUS2( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHETRD_2STAGE reduces a complex Hermitian matrix A to real symmetric +*> tridiagonal form T by a unitary similarity transformation: +*> Q1**H Q2**H* A * Q2 * Q1 = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> in particular for the second stage (Band to +*> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, +*> then LHOUS2 is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the band superdiagonal +*> of A are overwritten by the corresponding elements of the +*> internal band-diagonal matrix AB, and the elements above +*> the KD superdiagonal, with the array TAU, represent the unitary +*> matrix Q1 as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and band subdiagonal of A are over- +*> written by the corresponding elements of the internal band-diagonal +*> matrix AB, and the elements below the KD subdiagonal, with +*> the array TAU, represent the unitary matrix Q1 as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is DOUBLE PRECISION array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (N-KD) +*> The scalar factors of the elementary reflectors of +*> the first stage (see Further Details). +*> \endverbatim +*> +*> \param[out] HOUS2 +*> \verbatim +*> HOUS2 is COMPLEX*16 array, dimension LHOUS2, that +*> store the Householder representation of the stage2 +*> band to tridiagonal. +*> \endverbatim +*> +*> \param[in] LHOUS2 +*> \verbatim +*> LHOUS2 is INTEGER +*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS2 array, returns +*> this value as the first entry of the HOUS2 array, and no error +*> message related to LHOUS2 is issued by XERBLA. +*> LHOUS2 = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS2=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = max(stage1,stage2) + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16HEcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + $ HOUS2, LHOUS2, WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER VECT, UPLO + INTEGER N, LDA, LWORK, LHOUS2, INFO +* .. +* .. Array Arguments .. + DOUBLE PRECISION D( * ), E( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), + $ HOUS2( * ), WORK( * ) +* .. +* +* ===================================================================== +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER, WANTQ + INTEGER KD, IB, LWMIN, LHMIN, LWRK, LDAB, WPOS, ABPOS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZHETRD_HE2HB, ZHETRD_HB2ST +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS2.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + KD = ILAENV( 17, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 ) + IB = ILAENV( 18, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) +* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, +* $ LHMIN, LWMIN +* + IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LHOUS2.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -12 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHETRD_2STAGE', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDAB = KD+1 + LWRK = LWORK-LDAB*N + ABPOS = 1 + WPOS = ABPOS + LDAB*N + CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + $ TAU, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) + RETURN + END IF + CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, + $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) + RETURN + END IF +* +* + HOUS2( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of ZHETRD_2STAGE +* + END diff --git a/SRC/zhetrd_hb2st.F b/SRC/zhetrd_hb2st.F new file mode 100644 index 00000000..5d62e30d --- /dev/null +++ b/SRC/zhetrd_hb2st.F @@ -0,0 +1,603 @@ +*> \brief \b ZHBTRD +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHBTRD + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhbtrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhbtrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhbtrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +* #define PRECISION_COMPLEX +* +* #if defined(_OPENMP) +* use omp_lib +* #endif +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER STAGE1, UPLO, VECT +* INTEGER N, KD, IB, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. +* DOUBLE PRECISION D( * ), E( * ) +* COMPLEX*16 AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHBTRD reduces a complex Hermitian band matrix A to real symmetric +*> tridiagonal form T by a unitary similarity transformation: +*> Q**H * A * Q = T. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] STAGE +*> \verbatim +*> STAGE is CHARACTER*1 +*> = 'N': "No": to mention that the stage 1 of the reduction +*> from dense to band using the zhetrd_he2hb routine +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the zhetrd_he2hb +*> routine has been called to produce AB (e.g., AB is +*> the output of zhetrd_he2hb. +*> \endverbatim +*> +*> \param[in] VECT +*> \verbatim +*> VECT is CHARACTER*1 +*> = 'N': No need for the Housholder representation, +*> and thus LHOUS is of size max(1, 4*N); +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, +*> then LHOUS is to be queried and computed. +*> (NOT AVAILABLE IN THIS RELEASE). +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the matrix A if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB,N) +*> On entry, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> On exit, the diagonal elements of AB are overwritten by the +*> diagonal elements of the tridiagonal matrix T; if KD > 0, the +*> elements on the first superdiagonal (if UPLO = 'U') or the +*> first subdiagonal (if UPLO = 'L') are overwritten by the +*> off-diagonal elements of T; the rest of AB is overwritten by +*> values generated during the reduction. +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension (N) +*> The diagonal elements of the tridiagonal matrix T. +*> \endverbatim +*> +*> \param[out] E +*> \verbatim +*> E is DOUBLE PRECISION array, dimension (N-1) +*> The off-diagonal elements of the tridiagonal matrix T: +*> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. +*> \endverbatim +*> +*> \param[out] HOUS +*> \verbatim +*> HOUS is COMPLEX*16 array, dimension LHOUS, that +*> store the Householder representation. +*> \endverbatim +*> +*> \param[in] LHOUS +*> \verbatim +*> LHOUS is INTEGER +*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a query is assumed; the routine +*> only calculates the optimal size of the HOUS array, returns +*> this value as the first entry of the HOUS array, and no error +*> message related to LHOUS is issued by XERBLA. +*> LHOUS = MAX(1, dimension) where +*> dimension = 4*N if VECT='N' +*> not available now if VECT='H' +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> If LWORK = -1, or LHOUS=-1, +*> then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK = MAX(1, dimension) where +*> dimension = (2KD+1)*N + KD*NTHREADS +*> where KD is the blocking size of the reduction, +*> FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice +*> NTHREADS is the number of threads used when +*> openMP compilation is enabled, otherwise =1. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16OTHERcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) +* +#define PRECISION_COMPLEX +* +#if defined(_OPENMP) + use omp_lib +#endif +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER STAGE1, UPLO, VECT + INTEGER N, KD, LDAB, LHOUS, LWORK, INFO +* .. +* .. Array Arguments .. + DOUBLE PRECISION D( * ), E( * ) + COMPLEX*16 AB( LDAB, * ), HOUS( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION RZERO + COMPLEX*16 ZERO, ONE + PARAMETER ( RZERO = 0.0D+0, + $ ZERO = ( 0.0D+0, 0.0D+0 ), + $ ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, + $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, + $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, + $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN +#if defined (PRECISION_COMPLEX) + DOUBLE PRECISION ABSTMP + COMPLEX*16 TMP +#endif +* .. +* .. External Subroutines .. + EXTERNAL ZHB2ST_KERNELS, ZLACPY, ZLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX, CEILING, DBLE, REAL +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required. +* Test the input parameters +* + DEBUG = 0 + INFO = 0 + AFTERS1 = LSAME( STAGE1, 'Y' ) + WANTQ = LSAME( VECT, 'V' ) + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LHOUS.EQ.-1 ) +* +* Determine the block size, the workspace size and the hous size. +* + IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) +* + IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN + INFO = -1 + ELSE IF( .NOT.LSAME( VECT, 'N' ) ) THEN + INFO = -2 + ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -3 + ELSE IF( N.LT.0 ) THEN + INFO = -4 + ELSE IF( KD.LT.0 ) THEN + INFO = -5 + ELSE IF( LDAB.LT.(KD+1) ) THEN + INFO = -7 + ELSE IF( LHOUS.LT.LHMIN .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -13 + END IF +* + IF( INFO.EQ.0 ) THEN + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine pointer position +* + LDV = KD + IB + SIZETAU = 2 * N + SIZEV = 2 * N + INDTAU = 1 + INDV = INDTAU + SIZETAU + LDA = 2 * KD + 1 + SIZEA = LDA * N + INDA = 1 + INDW = INDA + SIZEA + NTHREADS = 1 + TID = 0 +* + IF( UPPER ) THEN + APOS = INDA + KD + AWPOS = INDA + DPOS = APOS + KD + OFDPOS = DPOS - 1 + ABDPOS = KD + 1 + ABOFDPOS = KD + ELSE + APOS = INDA + AWPOS = INDA + KD + 1 + DPOS = APOS + OFDPOS = DPOS + 1 + ABDPOS = 1 + ABOFDPOS = 2 + + ENDIF +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or +* in a parallel environment it might need two cores for D and E +* + IF( KD.EQ.0 ) THEN + DO 30 I = 1, N + D( I ) = DBLE( AB( ABDPOS, I ) ) + 30 CONTINUE + DO 40 I = 1, N-1 + E( I ) = RZERO + 40 CONTINUE + GOTO 200 + END IF +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal +* and offdiagonal elements real, and store them in D and E. +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler coversion formula might be used, but then +* updating the Q matrix will be required and based if Q is generated +* or not this might complicate the story. +* +C IF( KD.EQ.1 .AND. N.GT.(KD+1) .AND. AFTERS1 ) THEN + IF( KD.EQ.1 ) THEN + DO 50 I = 1, N + D( I ) = DBLE( AB( ABDPOS, I ) ) + 50 CONTINUE +#if defined (PRECISION_COMPLEX) +* +* make off-diagonal elements real and copy them to E +* + IF( UPPER ) THEN + DO 60 I = 1, N - 1 + TMP = AB( ABOFDPOS, I+1 ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I+1 ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+2 ) = AB( ABOFDPOS, I+2 )*TMP +C IF( WANTZ ) THEN +C CALL ZSCAL( N, DCONJG( TMP ), Q( 1, I+1 ), 1 ) +C END IF + 60 CONTINUE + ELSE + DO 70 I = 1, N - 1 + TMP = AB( ABOFDPOS, I ) + ABSTMP = ABS( TMP ) + AB( ABOFDPOS, I ) = ABSTMP + E( I ) = ABSTMP + IF( ABSTMP.NE.RZERO ) THEN + TMP = TMP / ABSTMP + ELSE + TMP = ONE + END IF + IF( I.LT.N-1 ) + $ AB( ABOFDPOS, I+1 ) = AB( ABOFDPOS, I+1 )*TMP +C IF( WANTQ ) THEN +C CALL ZSCAL( N, TMP, Q( 1, I+1 ), 1 ) +C END IF + 70 CONTINUE + ENDIF +#else + IF( UPPER ) THEN + DO 60 I = 1, N-1 + E( I ) = DBLE( AB( ABOFDPOS, I+1 ) ) + 60 CONTINUE + ELSE + DO 70 I = 1, N-1 + E( I ) = DBLE( AB( ABOFDPOS, I ) ) + 70 CONTINUE + ENDIF +#endif + GOTO 200 + END IF +* +* Main code start here. +* Reduce the hermitian band of A to a tridiagonal matrix. +* + THGRSIZ = N + GRSIZ = 1 + SHIFT = 3 + NBTILES = CEILING( REAL(N)/REAL(KD) ) + STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) + THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) +* + CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) + CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) +* +* +* openMP parallelisation start here +* +#if defined(_OPENMP) +!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) +!$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) +!$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) +!$OMP$ SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT ) +!$OMP MASTER +#endif +* +* main bulge chasing loop +* + DO 100 THGRID = 1, THGRNB + STT = (THGRID-1)*THGRSIZ+1 + THED = MIN( (STT + THGRSIZ -1), (N-1)) + DO 110 I = STT, N-1 + ED = MIN( I, THED ) + IF( STT.GT.ED ) GOTO 100 + DO 120 M = 1, STEPERCOL + ST = STT + DO 130 SWEEPID = ST, ED + DO 140 K = 1, GRSIZ + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + $ + (M-1)*GRSIZ + K + IF ( MYID.EQ.1 ) THEN + TTYPE = 1 + ELSE + TTYPE = MOD( MYID, 2 ) + 2 + ENDIF + + IF( TTYPE.EQ.2 ) THEN + COLPT = (MYID/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + BLKLASTIND = COLPT + ELSE + COLPT = ((MYID+1)/2)*KD + SWEEPID + STIND = COLPT-KD+1 + EDIND = MIN(COLPT,N) + IF( ( STIND.GE.EDIND-1 ).AND. + $ ( EDIND.EQ.N ) ) THEN + BLKLASTIND = N + ELSE + BLKLASTIND = 0 + ENDIF + ENDIF +* +* Call the kernel +* +#if defined(_OPENMP) + IF( TTYPE.NE.1 ) THEN +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(in:WORK(MYID-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ELSE +!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) +!$OMP$ DEPEND(out:WORK(MYID)) + TID = OMP_GET_THREAD_NUM() + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +!$OMP END TASK + ENDIF +#else + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + $ STIND, EDIND, SWEEPID, N, KD, IB, + $ WORK ( INDA ), LDA, + $ HOUS( INDV ), HOUS( INDTAU ), LDV, + $ WORK( INDW + TID*KD ) ) +#endif + IF ( BLKLASTIND.GE.(N-1) ) THEN + STT = STT + 1 + GOTO 130 + ENDIF + 140 CONTINUE + 130 CONTINUE + 120 CONTINUE + 110 CONTINUE + 100 CONTINUE +* +#if defined(_OPENMP) +!$OMP END MASTER +!$OMP END PARALLEL +#endif +* +* Copy the diagonal from A to D. Note that D is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + DO 150 I = 1, N + D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) + 150 CONTINUE +* +* Copy the off diagonal from A to E. Note that E is REAL thus only +* the Real part is needed, the imaginary part should be zero. +* + IF( UPPER ) THEN + DO 160 I = 1, N-1 + E( I ) = DBLE( WORK( OFDPOS+I*LDA ) ) + 160 CONTINUE + ELSE + DO 170 I = 1, N-1 + E( I ) = DBLE( WORK( OFDPOS+(I-1)*LDA ) ) + 170 CONTINUE + ENDIF +* + 200 CONTINUE +* + HOUS( 1 ) = LHMIN + WORK( 1 ) = LWMIN + RETURN +* +* End of ZHETRD_HB2ST +* + END +#undef PRECISION_COMPLEX + diff --git a/SRC/zhetrd_he2hb.f b/SRC/zhetrd_he2hb.f new file mode 100644 index 00000000..9403b73e --- /dev/null +++ b/SRC/zhetrd_he2hb.f @@ -0,0 +1,517 @@ +*> \brief \b ZHETRD_HE2HB +* +* @precisions fortran z -> s d c +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZHETRD_HE2HB + dependencies +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd.f"> +*> [TGZ]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd.f"> +*> [ZIP]</a> +*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd.f"> +*> [TXT]</a> +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, +* WORK, LWORK, INFO ) +* +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), AB( LDAB, * ), +* TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZHETRD_HE2HB reduces a complex Hermitian matrix A to complex Hermitian +*> band-diagonal form AB by a unitary similarity transformation: +*> Q**H * A * Q = AB. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> = 'U': Upper triangle of A is stored; +*> = 'L': Lower triangle of A is stored. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KD +*> \verbatim +*> KD is INTEGER +*> The number of superdiagonals of the reduced matrix if UPLO = 'U', +*> or the number of subdiagonals if UPLO = 'L'. KD >= 0. +*> The reduced matrix is stored in the array AB. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading +*> N-by-N upper triangular part of A contains the upper +*> triangular part of the matrix A, and the strictly lower +*> triangular part of A is not referenced. If UPLO = 'L', the +*> leading N-by-N lower triangular part of A contains the lower +*> triangular part of the matrix A, and the strictly upper +*> triangular part of A is not referenced. +*> On exit, if UPLO = 'U', the diagonal and first superdiagonal +*> of A are overwritten by the corresponding elements of the +*> tridiagonal matrix T, and the elements above the first +*> superdiagonal, with the array TAU, represent the unitary +*> matrix Q as a product of elementary reflectors; if UPLO +*> = 'L', the diagonal and first subdiagonal of A are over- +*> written by the corresponding elements of the tridiagonal +*> matrix T, and the elements below the first subdiagonal, with +*> the array TAU, represent the unitary matrix Q as a product +*> of elementary reflectors. See Further Details. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB,N) +*> On exit, the upper or lower triangle of the Hermitian band +*> matrix A, stored in the first KD+1 rows of the array. The +*> j-th column of A is stored in the j-th column of the array AB +*> as follows: +*> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; +*> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KD+1. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (N-KD) +*> The scalar factors of the elementary reflectors (see Further +*> Details). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension LWORK. +*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK(1) returns the size of LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK which should be calculated +* by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> LWORK_QUERY = N*KD + N*max(KD,FACTOPTNB) + 2*KD*KD +*> where FACTOPTNB is the blocking used by the QR or LQ +*> algorithm, usually FACTOPTNB=128 is a good choice otherwise +*> putting LWORK=-1 will provide the size of WORK. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2016 +* +*> \ingroup complex16HEcomputational +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> Implemented by Azzam Haidar. +*> +*> All details are available on technical report, SC11, SC13 papers. +*> +*> Azzam Haidar, Hatem Ltaief, and Jack Dongarra. +*> Parallel reduction to condensed forms for symmetric eigenvalue problems +*> using aggregated fine-grained and memory-aware kernels. In Proceedings +*> of 2011 International Conference for High Performance Computing, +*> Networking, Storage and Analysis (SC '11), New York, NY, USA, +*> Article 8 , 11 pages. +*> http://doi.acm.org/10.1145/2063384.2063394 +*> +*> A. Haidar, J. Kurzak, P. Luszczek, 2013. +*> An improved parallel singular value algorithm and its implementation +*> for multicore hardware, In Proceedings of 2013 International Conference +*> for High Performance Computing, Networking, Storage and Analysis (SC '13). +*> Denver, Colorado, USA, 2013. +*> Article 90, 12 pages. +*> http://doi.acm.org/10.1145/2503210.2503292 +*> +*> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> calculations based on fine-grained memory aware tasks. +*> International Journal of High Performance Computing Applications. +*> Volume 28 Issue 2, Pages 196-209, May 2014. +*> http://hpc.sagepub.com/content/28/2/196 +*> +*> \endverbatim +*> +*> \verbatim +*> +*> If UPLO = 'U', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(k)**H . . . H(2)**H H(1)**H, where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**H +*> +*> where tau is a complex scalar, and v is a complex vector with +*> v(1:i+kd-1) = 0 and v(i+kd) = 1; conjg(v(i+kd+1:n)) is stored on exit in +*> A(i,i+kd+1:n), and tau in TAU(i). +*> +*> If UPLO = 'L', the matrix Q is represented as a product of elementary +*> reflectors +*> +*> Q = H(1) H(2) . . . H(k), where k = n-kd. +*> +*> Each H(i) has the form +*> +*> H(i) = I - tau * v * v**H +*> +*> where tau is a complex scalar, and v is a complex vector with +*> v(kd+1:i) = 0 and v(i+kd+1) = 1; v(i+kd+2:n) is stored on exit in +* A(i+kd+2:n,i), and tau in TAU(i). +*> +*> The contents of A on exit are illustrated by the following examples +*> with n = 5: +*> +*> if UPLO = 'U': if UPLO = 'L': +*> +*> ( ab ab/v1 v1 v1 v1 ) ( ab ) +*> ( ab ab/v2 v2 v2 ) ( ab/v1 ab ) +*> ( ab ab/v3 v3 ) ( v1 ab/v2 ab ) +*> ( ab ab/v4 ) ( v1 v2 ab/v3 ab ) +*> ( ab ) ( v1 v2 v3 ab/v4 ab ) +*> +*> where d and e denote diagonal and off-diagonal elements of T, and vi +*> denotes an element of the vector defining H(i). +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZHETRD_HE2HB( UPLO, N, KD, A, LDA, AB, LDAB, TAU, + $ WORK, LWORK, INFO ) +* + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2016 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, LDAB, LWORK, N, KD +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), AB( LDAB, * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION RONE + COMPLEX*16 ZERO, ONE, HALF + PARAMETER ( RONE = 1.0D+0, + $ ZERO = ( 0.0D+0, 0.0D+0 ), + $ ONE = ( 1.0D+0, 0.0D+0 ), + $ HALF = ( 0.5D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, UPPER + INTEGER I, J, IINFO, LWMIN, PN, PK, LK, + $ LDT, LDW, LDS2, LDS1, + $ LS2, LS1, LW, LT, + $ TPOS, WPOS, S2POS, S1POS +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZHER2K, ZHEMM, ZGEMM, + $ ZLARFT, ZGELQF, ZGEQRF, ZLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN, MAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. Executable Statements .. +* +* Determine the minimal workspace size required +* and test the input parameters +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + LQUERY = ( LWORK.EQ.-1 ) + LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( KD.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDAB.LT.MAX( 1, KD+1 ) ) THEN + INFO = -7 + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN + END IF +* +* Quick return if possible +* Copy the upper/lower portion of A into AB +* + IF( N.LE.KD+1 ) THEN + IF( UPPER ) THEN + DO 100 I = 1, N + LK = MIN( KD+1, I ) + CALL ZCOPY( LK, A( I-LK+1, I ), 1, + $ AB( KD+1-LK+1, I ), 1 ) + 100 CONTINUE + ELSE + DO 110 I = 1, N + LK = MIN( KD+1, N-I+1 ) + CALL ZCOPY( LK, A( I, I ), 1, AB( 1, I ), 1 ) + 110 CONTINUE + ENDIF + WORK( 1 ) = 1 + RETURN + END IF +* +* Determine the pointer position for the workspace +* + LDT = KD + LDS1 = KD + LT = LDT*KD + LW = N*KD + LS1 = LDS1*KD + LS2 = LWMIN - LT - LW - LS1 +* LS2 = N*MAX(KD,FACTOPTNB) + TPOS = 1 + WPOS = TPOS + LT + S1POS = WPOS + LW + S2POS = S1POS + LS1 + IF( UPPER ) THEN + LDW = KD + LDS2 = KD + ELSE + LDW = N + LDS2 = N + ENDIF +* +* +* Set the workspace of the triangular matrix T to zero once such a +* way everytime T is generated the upper/lower portion will be always zero +* + CALL ZLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) +* + IF( UPPER ) THEN + DO 10 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the LQ factorization of the current block +* + CALL ZGELQF( KD, PN, A( I, I+KD ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 20 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 20 CONTINUE +* + CALL ZLASET( 'Lower', PK, PK, ZERO, ONE, + $ A( I, I+KD ), LDA ) +* +* Form the matrix T +* + CALL ZLARFT( 'Forward', 'Rowwise', PN, PK, + $ A( I, I+KD ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL ZGEMM( 'Conjugate', 'No transpose', PK, PN, PK, + $ ONE, WORK( TPOS ), LDT, + $ A( I, I+KD ), LDA, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL ZHEMM( 'Right', UPLO, PK, PN, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL ZGEMM( 'No transpose', 'Conjugate', PK, PK, PN, + $ ONE, WORK( WPOS ), LDW, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL ZGEMM( 'No transpose', 'No transpose', PK, PN, PK, + $ -HALF, WORK( S1POS ), LDS1, + $ A( I, I+KD ), LDA, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V'*W - W'*V +* + CALL ZHER2K( UPLO, 'Conjugate', PN, PK, + $ -ONE, A( I, I+KD ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) + 10 CONTINUE +* +* Copy the upper band to AB which is the band storage matrix +* + DO 30 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL ZCOPY( LK, A( J, J ), LDA, AB( KD+1, J ), LDAB-1 ) + 30 CONTINUE +* + ELSE +* +* Reduce the lower triangle of A to lower band matrix +* + DO 40 I = 1, N - KD, KD + PN = N-I-KD+1 + PK = MIN( N-I-KD+1, KD ) +* +* Compute the QR factorization of the current block +* + CALL ZGEQRF( PN, KD, A( I+KD, I ), LDA, + $ TAU( I ), WORK( S2POS ), LS2, IINFO ) +* +* Copy the upper portion of A into AB +* + DO 50 J = I, I+PK-1 + LK = MIN( KD, N-J ) + 1 + CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 50 CONTINUE +* + CALL ZLASET( 'Upper', PK, PK, ZERO, ONE, + $ A( I+KD, I ), LDA ) +* +* Form the matrix T +* + CALL ZLARFT( 'Forward', 'Columnwise', PN, PK, + $ A( I+KD, I ), LDA, TAU( I ), + $ WORK( TPOS ), LDT ) +* +* Compute W: +* + CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ ONE, A( I+KD, I ), LDA, + $ WORK( TPOS ), LDT, + $ ZERO, WORK( S2POS ), LDS2 ) +* + CALL ZHEMM( 'Left', UPLO, PN, PK, + $ ONE, A( I+KD, I+KD ), LDA, + $ WORK( S2POS ), LDS2, + $ ZERO, WORK( WPOS ), LDW ) +* + CALL ZGEMM( 'Conjugate', 'No transpose', PK, PK, PN, + $ ONE, WORK( S2POS ), LDS2, + $ WORK( WPOS ), LDW, + $ ZERO, WORK( S1POS ), LDS1 ) +* + CALL ZGEMM( 'No transpose', 'No transpose', PN, PK, PK, + $ -HALF, A( I+KD, I ), LDA, + $ WORK( S1POS ), LDS1, + $ ONE, WORK( WPOS ), LDW ) +* +* +* Update the unreduced submatrix A(i+kd:n,i+kd:n), using +* an update of the form: A := A - V*W' - W*V' +* + CALL ZHER2K( UPLO, 'No transpose', PN, PK, + $ -ONE, A( I+KD, I ), LDA, + $ WORK( WPOS ), LDW, + $ RONE, A( I+KD, I+KD ), LDA ) +* ================================================================== +* RESTORE A FOR COMPARISON AND CHECKING TO BE REMOVED +* DO 45 J = I, I+PK-1 +* LK = MIN( KD, N-J ) + 1 +* CALL ZCOPY( LK, AB( 1, J ), 1, A( J, J ), 1 ) +* 45 CONTINUE +* ================================================================== + 40 CONTINUE +* +* Copy the lower band to AB which is the band storage matrix +* + DO 60 J = N-KD+1, N + LK = MIN(KD, N-J) + 1 + CALL ZCOPY( LK, A( J, J ), 1, AB( 1, J ), 1 ) + 60 CONTINUE + + END IF +* + WORK( 1 ) = LWMIN + RETURN +* +* End of ZHETRD_HE2HB +* + END diff --git a/SRC/zlarfy.f b/SRC/zlarfy.f new file mode 100644 index 00000000..39b795f0 --- /dev/null +++ b/SRC/zlarfy.f @@ -0,0 +1,163 @@ +*> \brief \b ZLARFY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* .. Scalar Arguments .. +* CHARACTER UPLO +* INTEGER INCV, LDC, N +* COMPLEX*16 TAU +* .. +* .. Array Arguments .. +* COMPLEX*16 C( LDC, * ), V( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFY applies an elementary reflector, or Householder matrix, H, +*> to an n x n Hermitian matrix C, from both the left and the right. +*> +*> H is represented in the form +*> +*> H = I - tau * v * v' +*> +*> where tau is a scalar and v is a vector. +*> +*> If tau is zero, then H is taken to be the unit matrix. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the upper or lower triangular part of the +*> Hermitian matrix C is stored. +*> = 'U': Upper triangle +*> = 'L': Lower triangle +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of rows and columns of the matrix C. N >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> (1 + (N-1)*abs(INCV)) +*> The vector v as described above. +*> \endverbatim +*> +*> \param[in] INCV +*> \verbatim +*> INCV is INTEGER +*> The increment between successive elements of v. INCV must +*> not be zero. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is COMPLEX*16 +*> The value tau as described above. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC, N) +*> On entry, the matrix C. +*> On exit, C is overwritten by H * C * H'. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max( 1, N ). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (N) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2011 +* +*> \ingroup complex16_eig +* +* ===================================================================== + SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) +* +* -- LAPACK test routine (version 3.4.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2011 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCV, LDC, N + COMPLEX*16 TAU +* .. +* .. Array Arguments .. + COMPLEX*16 C( LDC, * ), V( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO, HALF + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ), + $ HALF = ( 0.5D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + COMPLEX*16 ALPHA +* .. +* .. External Subroutines .. + EXTERNAL ZAXPY, ZHEMV, ZHER2 +* .. +* .. External Functions .. + COMPLEX*16 ZDOTC + EXTERNAL ZDOTC +* .. +* .. Executable Statements .. +* + IF( TAU.EQ.ZERO ) + $ RETURN +* +* Form w:= C * v +* + CALL ZHEMV( UPLO, N, ONE, C, LDC, V, INCV, ZERO, WORK, 1 ) +* + ALPHA = -HALF*TAU*ZDOTC( N, WORK, 1, V, INCV ) + CALL ZAXPY( N, ALPHA, V, INCV, WORK, 1 ) +* +* C := C - v * w' - w * v' +* + CALL ZHER2( UPLO, N, -TAU, V, INCV, WORK, 1, C, LDC ) +* + RETURN +* +* End of ZLARFY +* + END |