Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
089ba660
Commit
089ba660
authored
Jan 28, 2017
by
Andreas Marek
Browse files
Cleanup
parent
58923f0f
Changes
12
Pipelines
1
Show whitespace changes
Inline
Side-by-side
src/elpa1_compute_private.F90
View file @
089ba660
...
@@ -160,6 +160,11 @@ module ELPA1_COMPUTE
...
@@ -160,6 +160,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 8
#define BYTESIZE 8
#define REALCASE 1
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
...
@@ -167,7 +172,8 @@ module ELPA1_COMPUTE
...
@@ -167,7 +172,8 @@ module ELPA1_COMPUTE
#undef BYTESIZE
#undef BYTESIZE
#undef REALCASE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
! single precision
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
...
@@ -176,12 +182,19 @@ module ELPA1_COMPUTE
...
@@ -176,12 +182,19 @@ module ELPA1_COMPUTE
#define BYTESIZE 4
#define BYTESIZE 4
#define REALCASE 1
#define REALCASE 1
#define SINGLE_PRECISION 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef DATATYPE
#undef BYTESIZE
#undef BYTESIZE
#undef REALCASE
#undef REALCASE
#undef SINGLE_PRECISION
#undef SINGLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
#endif
#endif
...
@@ -192,6 +205,11 @@ module ELPA1_COMPUTE
...
@@ -192,6 +205,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 16
#define BYTESIZE 16
#define COMPLEXCASE 1
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef DATATYPE
...
@@ -199,6 +217,8 @@ module ELPA1_COMPUTE
...
@@ -199,6 +217,8 @@ module ELPA1_COMPUTE
#undef COMPLEXCASE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION
#undef PRECISION_SUFFIX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
...
@@ -207,6 +227,11 @@ module ELPA1_COMPUTE
...
@@ -207,6 +227,11 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck4)
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef DATATYPE
...
...
src/elpa1_merge_systems_real_template.X90
View file @
089ba660
...
@@ -120,8 +120,7 @@
...
@@ -120,8 +120,7 @@
endif
endif
#endif
#endif
call timer%start("merge_systems" // &
call timer%start("merge_systems" // PRECISION_SUFFIX)
&PRECISION_SUFFIX)
success = .true.
success = .true.
call timer%start("mpi_communication")
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...
...
src/elpa1_tools_template.X90
View file @
089ba660
...
@@ -244,7 +244,7 @@
...
@@ -244,7 +244,7 @@
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
subroutine hh_transform_complex_&
subroutine hh_transform_complex_&
#endif
#endif
&PRECISION &
&PRECISION &
(alpha, xnorm_sq, xf, tau)
(alpha, xnorm_sq, xf, tau)
#if REALCASE == 1
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
...
@@ -282,8 +282,7 @@
...
@@ -282,8 +282,7 @@
call timer%start("hh_tranform_&
call timer%start("hh_tranform_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&" // &
&" // &
&PRECISION_SUFFIX &
&PRECISION_SUFFIX )
)
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=REAL_DATATYPE )
ALPHR = real( ALPHA, kind=REAL_DATATYPE )
...
@@ -347,8 +346,7 @@
...
@@ -347,8 +346,7 @@
call timer%stop("hh_transform_&
call timer%stop("hh_transform_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&" // &
&" // &
&PRECISION_SUFFIX &
&PRECISION_SUFFIX )
)
#if REALCASE == 1
#if REALCASE == 1
end subroutine hh_transform_real_&
end subroutine hh_transform_real_&
...
@@ -356,4 +354,4 @@
...
@@ -356,4 +354,4 @@
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
end subroutine hh_transform_complex_&
end subroutine hh_transform_complex_&
#endif
#endif
&PRECISION
&PRECISION
src/elpa2_bandred_template.X90
View file @
089ba660
...
@@ -258,7 +258,7 @@
...
@@ -258,7 +258,7 @@
na_rows = na
na_rows = na
#endif
#endif
na_cols = na
na_cols = na
#endif
#endif
/* WITH_MPI */
! Here we convert the regular host array into a pinned host array
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* &
successCUDA = cuda_malloc(a_dev, lda*na_cols* &
...
@@ -1326,9 +1326,9 @@
...
@@ -1326,9 +1326,9 @@
if (tile_size < istep*nbw) then
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
istep*nbw, n_cols, nblk)
...
@@ -1401,7 +1401,7 @@
...
@@ -1401,7 +1401,7 @@
endif
endif
call symm_matrix_allreduce_&
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw,nbw,mpi_comm_cols)
(n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
...
@@ -1427,9 +1427,9 @@
...
@@ -1427,9 +1427,9 @@
! Transpose umc -> umr (stored in vmr, second half)
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(umcCUDA, cur_l_cols, mpi_comm_cols, &
(umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk)
...
@@ -1466,9 +1466,9 @@
...
@@ -1466,9 +1466,9 @@
! Or if we used the Algorithm 4
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_&
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
istep*nbw, n_cols, nblk)
...
@@ -1512,7 +1512,7 @@
...
@@ -1512,7 +1512,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call timer%stop("blas")
call symm_matrix_allreduce_&
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw ,mpi_comm_cols)
(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
! U = U - 0.5 * V * VAV
...
@@ -1522,9 +1522,9 @@
...
@@ -1522,9 +1522,9 @@
call timer%stop("blas")
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk)
...
@@ -1588,9 +1588,9 @@
...
@@ -1588,9 +1588,9 @@
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
if (tile_size < istep*nbw) then
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
istep*nbw, n_cols, nblk)
...
@@ -1683,7 +1683,7 @@
...
@@ -1683,7 +1683,7 @@
endif
endif
call herm_matrix_allreduce_&
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw,mpi_comm_cols)
(n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
...
@@ -1699,7 +1699,7 @@
...
@@ -1699,7 +1699,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call timer%stop("blas")
call herm_matrix_allreduce_&
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav,nbw,nbw,mpi_comm_cols)
(n_cols,vav,nbw,nbw,mpi_comm_cols)
endif
endif
...
@@ -1723,9 +1723,9 @@
...
@@ -1723,9 +1723,9 @@
stop
stop
endif
endif
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk)
...
@@ -1755,9 +1755,9 @@
...
@@ -1755,9 +1755,9 @@
call timer%stop("blas")
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_&
&_&
&PRECISION &
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk)
...
@@ -1826,12 +1826,22 @@
...
@@ -1826,12 +1826,22 @@
! endif
! endif
!#endif
!#endif
! this is not needed since a_dev is passed along from one subroutine to the other
! this is not needed since a_dev is passed along from one subroutine to the other
successCUDA = cuda_memcpy ( &
#if REALCASE == 1
#if REALCASE == 1
successCUDA = cuda_memcpy (
loc
(a),
a_dev, lda*na_cols*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
loc(a),
&
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy ( loc(a(1,1)), a_dev, lda*na_cols*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
loc(a(1,1)), &
#endif
a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE ==1
size_of_PRECISION_complex,&
#endif
#endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
if (.not.(successCUDA)) then
print *,"bandred_&
print *,"bandred_&
&MATH_DATATYPE&
&MATH_DATATYPE&
...
...
src/elpa2_compute.F90
View file @
089ba660
...
@@ -116,25 +116,43 @@ module ELPA2_compute
...
@@ -116,25 +116,43 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define REAL_DATATYPE rk8
#define BYTESIZE 8
#define BYTESIZE 8
#define REALCASE 1
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#define PRECISION double
#include "redist_band.X90"
#include "redist_band.X90"
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REAL_DATATYPE
#undef BYTESIZE
#undef BYTESIZE
#undef REALCASE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION_SUFFIX
#undef PRECISION
! single precision
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define REAL_DATATYPE rk4
#define REAL_DATATYPE rk4
#define BYTESIZE 4
#define BYTESIZE 4
#define REALCASE 1
#define REALCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#include "redist_band.X90"
#undef REAL_DATATYPE
#undef REAL_DATATYPE
#undef BYTESIZE
#undef BYTESIZE
#undef REALCASE
#undef REALCASE
#undef PRECISION_SUFFIX
#undef PRECISION
#endif
#endif
/* WANT_SINGLE_PRECISION_REAL */
! double precision
! double precision
#define DOUBLE_PRECISION_COMPLEX 1
#define DOUBLE_PRECISION_COMPLEX 1
...
@@ -142,22 +160,40 @@ module ELPA2_compute
...
@@ -142,22 +160,40 @@ module ELPA2_compute
#define COMPLEX_DATATYPE ck8
#define COMPLEX_DATATYPE ck8
#define BYTESIZE 16
#define BYTESIZE 16
#define COMPLEXCASE 1
#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#undef PRECISION
#define PRECISION double
#include "redist_band.X90"
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef BYTESIZE
#undef COMPLEXCASE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION_SUFFIX
#undef PRECISION
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define COMPLEX_DATATYPE ck4
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#define COMPLEXCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef BYTESIZE
#undef COMPLEXCASE
#undef COMPLEXCASE
#undef PRECISION_SUFFIX
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
...
...
src/elpa2_compute_real_template.X90
View file @
089ba660
...
@@ -141,22 +141,6 @@
...
@@ -141,22 +141,6 @@
character(200) :: errorMessage
character(200) :: errorMessage
call timer%start("band_band_real" // PRECISION_SUFFIX)
call timer%start("band_band_real" // PRECISION_SUFFIX)
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
! endif
! if (na .lt. 2*nb2) then
! print *,"na lt 2*nb2 ",na,2*nb2
! stop
! endif
! if (na .lt. nbCol) then
! print *,"na lt nbCol ",na,nbCol
! stop
! endif
! if (na .lt. nb2Col) then
! print *,"na lt nb2Col ",na,nb2Col
! stop
! endif
call timer%start("mpi_communication")
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
...
...
src/elpa2_template.X90
View file @
089ba660
...
@@ -69,7 +69,7 @@
...
@@ -69,7 +69,7 @@
call timer%start("solve_evp_&
call timer%start("solve_evp_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_2stage
_
" // &
&_2stage" // &
&PRECISION_SUFFIX &
&PRECISION_SUFFIX &
)
)
...
@@ -483,7 +483,7 @@
...
@@ -483,7 +483,7 @@
call timer%stop("solve_evp_&
call timer%stop("solve_evp_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_2stage
_
" // &
&_2stage" // &
&PRECISION_SUFFIX &
&PRECISION_SUFFIX &
)
)
1 format(a,f10.3)
1 format(a,f10.3)
...
...
src/elpa2_trans_ev_tridi_to_band_template.X90
View file @
089ba660
...
@@ -238,7 +238,7 @@
...
@@ -238,7 +238,7 @@
call timer%start("trans_ev_tridi_to_band_&
call timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&
_
" // &
&" // &
&PRECISION_SUFFIX &
&PRECISION_SUFFIX &
)
)
...
@@ -275,7 +275,7 @@
...
@@ -275,7 +275,7 @@
na_cols = na
na_cols = na
#endif
#endif
endif
endif
#endif /* COMPLEXCASE
c
*/
#endif /* COMPLEXCASE
*/
if (mod(nbw,nblk)/=0) then
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (my_prow==0 .and. my_pcol==0) then
...
@@ -370,7 +370,9 @@
...
@@ -370,7 +370,9 @@
#endif
#endif
endif ! useGPU
endif ! useGPU
#else /* WITH_OPENMP */
#else /* WITH_OPENMP */
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
! Suggested stripe width is 48 - should this be reduced for the complex case ???
...
@@ -553,7 +555,7 @@
...
@@ -553,7 +555,7 @@
#endif
#endif
print *,"trans_ev_tridi_to_band_&
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&
^
: error when allocating aIntern"//errorMessage
&: error when allocating aIntern"//errorMessage
stop
stop
endif
endif
...
@@ -721,9 +723,9 @@
...
@@ -721,9 +723,9 @@
#endif /* WITH_MPI */
#endif /* WITH_MPI */
call unpack_row_&
call unpack_row_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_cpu_&
&_cpu_&
&PRECISION &
&PRECISION &
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
endif ! useGPU
#endif /* WITH_OPENMP */
#endif /* WITH_OPENMP */
...
@@ -776,9 +778,9 @@
...
@@ -776,9 +778,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
do my_thread = 1, max_threads
call unpack_row_&
call unpack_row_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&_cpu_openmp_&
&_cpu_openmp_&
&PRECISION &