There is a maintenance of MPCDF Gitlab on Thursday, April 22nd 2020, 9:00 am CEST - Expect some service interruptions during this time

Commit b6e1b918 authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master_fix_up' into ELPA_KNL

parents aebf900d 303c02e6
......@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/precision_macros.h \
src/precision_macros_complex.h
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION)
......@@ -356,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_real2_default.F90 \
test/Fortran/test_real2_qr.F90 \
test/Fortran/test_real2_api.F90 \
test/Fortran/test_real2_banded.F90 \
test/Fortran/test_real.F90 \
test/Fortran/test_real_with_c.F90 \
test/Fortran/test_toeplitz.F90 \
......@@ -386,6 +384,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@ \
elpa2_test_real_qr@SUFFIX@ \
elpa2_test_real_api@SUFFIX@ \
elpa2_test_real_banded@SUFFIX@ \
elpa2_test_complex@SUFFIX@ \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
......@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@_SOURCES = test/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@_SOURCES = test/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
......@@ -772,6 +776,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh \
elpa2_test_complex_default@SUFFIX@.sh \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
......@@ -942,18 +947,15 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/precision_macros.h \
src/precision_macros_complex.h \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
......
......@@ -2,9 +2,9 @@
import sys
simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
"tridiag_band_NUMBER_PRECISION",
......@@ -15,10 +15,11 @@ simple_tokens = [
"solve_tridi_PRECISION",
"solve_tridi_col_PRECISION",
"solve_tridi_single_problem_PRECISION",
"solve_evp_NUMBER_2stage_PRECISION",
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
"herm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION",
"unpack_row_NUMBER_cpu_openmp_PRECISION",
......@@ -45,9 +46,19 @@ simple_tokens = [
"global_product_PRECISION",
"add_tmp_PRECISION",
"v_add_s_PRECISION",
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION",
"compute_hh_trafo_NUMBER_gpu_PRECISION",
"launch_my_pack_c_kernel_NUMBER_PRECISION",
"launch_my_unpack_c_kernel_NUMBER_PRECISION",
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS",
"UPCASENUMBER_ELPA_KERNEL_GENERIC",
"DEFAULT_UPCASENUMBER_ELPA_KERNEL",
"UPCASENUMBER_ELPA_KERNEL_NAMES",
"UPCASENUMBER_ELPA_KERNEL_GPU",
]
blas_tokens = [
"PRECISION_GEMV",
"PRECISION_TRMV",
......@@ -57,6 +68,8 @@ blas_tokens = [
"PRECISION_SYRK",
"PRECISION_SYMV",
"PRECISION_SYMM",
"PRECISION_HEMV",
"PRECISION_HER2",
"PRECISION_SYR2",
"PRECISION_SYR2K",
"PRECISION_GEQRF",
......@@ -75,6 +88,7 @@ blas_tokens = [
explicit_tokens_complex = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_COMPLEX_EXPLICIT_PRECISION", "MPI_COMPLEX16", "MPI_COMPLEX8"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
......@@ -82,8 +96,15 @@ explicit_tokens_complex = [
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("CONST_REAL_0_5", "0.5_rk8", "0.5_rk4"),
("CONST_COMPLEX_PAIR_0_0", "(0.0_rk8,0.0_rk8)", "(0.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_1_0", "(1.0_rk8,0.0_rk8)", "(1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_1_0", "(-1.0_rk8,0.0_rk8)", "(-1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_0_5", "(-0.5_rk8,0.0_rk8)", "(-0.5_rk4,0.0_rk4)"),
("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
("CONST_COMPLEX_1_0", "1.0_ck8", "1.0_ck4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
explicit_tokens_real = [
......@@ -95,6 +116,7 @@ explicit_tokens_real = [
("CONST_8_0", "8.0_rk8", "8.0_rk4"),
("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
......@@ -103,7 +125,10 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def print_variant(number, precision, explicit):
for token in simple_tokens:
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
print "#define ", token, token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number)
print "#define ", token + "_STR", "'" + token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number) + "'"
if("NUMBER" in token):
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
for token in explicit:
......@@ -111,28 +136,51 @@ def print_variant(number, precision, explicit):
def print_undefs(number, explicit):
# Print one "#undef" line to stdout for each macro name handled here:
# the entries of simple_tokens, the entries of blas_tokens, and the first
# element of every tuple in `explicit`.
#
#   number   -- string substituted for the "NUMBER" placeholder in
#               simple_tokens entries (callers pass "real" or "complex").
#   explicit -- sequence of tuples; only element [0], the macro name, is used.
#
# NOTE(review): this listing is a flattened diff with indentation and +/-
# markers stripped. The first print inside the simple_tokens loop looks like
# the pre-change statement that the following three statements replace --
# confirm against the repository before treating it as live code.
for token in simple_tokens:
print "#undef ", token.replace("NUMBER", number)
# Undefine the generic placeholder name and its "_STR" companion.
print "#undef ", token
print "#undef ", token + "_STR"
# Tokens containing "NUMBER" also get the number-substituted name undefined.
if("NUMBER" in token):
print "#undef ", token.replace("NUMBER", number)
# BLAS wrapper macros are undefined under their placeholder names as-is.
for token in blas_tokens:
print "#undef ", token
# Explicit tokens: element [0] is the macro name.
for token in explicit:
print "#undef ", token[0]
if(sys.argv[1] == "complex"):
print "#ifdef DOUBLE_PRECISION_COMPLEX"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "double", explicit_tokens_complex)
print "#else"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
elif(sys.argv[1] == "real"):
print "#ifdef DOUBLE_PRECISION_REAL"
print_undefs("real", explicit_tokens_real)
print_variant("real", "double", explicit_tokens_real)
print "#else"
print_undefs("real", explicit_tokens_real)
print_variant("real", "single", explicit_tokens_real)
print "#endif"
else:
assert(False)
\ No newline at end of file
# Emit the preprocessor text on stdout in two sections, one guarded by
# REALCASE and one by COMPLEXCASE. Each section resets MATH_DATATYPE, undefines
# the macros via print_undefs, then emits the print_variant output for double
# precision inside "#ifdef DOUBLE_PRECISION" and for single precision inside
# "#ifdef SINGLE_PRECISION". The two precision guards are independent #ifdef
# blocks with no #else between them.
print "#ifdef REALCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE real"
# Clear the real-case macros before the variant output that follows.
print_undefs("real", explicit_tokens_real)
#print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("real", "double", explicit_tokens_real)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("real", "single", explicit_tokens_real)
print "#endif"
# Closes "#ifdef REALCASE".
print "#endif"
# Complex section: same layout as the real section, using the complex tokens.
print "#ifdef COMPLEXCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE complex"
#print_undefs("real", explicit_tokens_real)
print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("complex", "double", explicit_tokens_complex)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
# Closes "#ifdef COMPLEXCASE".
print "#endif"
#print "#elif MACROS_TYPE == COMPLEX_DOUBLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "double", explicit_tokens_complex)
#print "#elif MACROS_TYPE == COMPLEX_SINGLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "single", explicit_tokens_complex)
#print "#endif"
......@@ -159,12 +159,14 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
......@@ -173,11 +175,13 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#endif
......@@ -187,11 +191,13 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
......@@ -200,11 +206,13 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
......@@ -213,6 +221,9 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
......@@ -220,6 +231,7 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
......@@ -228,11 +240,15 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
!remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
......@@ -245,10 +261,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck8
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
......@@ -263,11 +283,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
!remove ? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
......
......@@ -52,13 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
#if REALCASE == 1
#include "precision_macros.h"
#endif
#if COMPLEXCASE == 1
#include "precision_macros_complex.h"
#endif
#if REALCASE == 1
......
......@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
#if REALCASE == 1
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
......@@ -128,11 +127,18 @@
integer(kind=ik) :: max_stored_rows
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
real(kind=rk8), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
real(kind=rk4), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
complex(kind=ck8), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=ck4), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
complex(kind=ck4), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -155,12 +161,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
#if REALCASE == 1
call timer%start("trans_ev_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
#endif
call timer%start("trans_ev_&
&MATH_DATATYPE&
&_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -179,54 +184,39 @@
max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmat", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmat", istat, errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h1", istat, errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvn", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvn", istat, errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm", istat, errorMessage)
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
......@@ -241,66 +231,52 @@
#if COMPLEXCASE == 1
! In the complex case tau(2) /= 0
if (my_prow == prow(1, nblk, np_rows)) then
q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(CONE-tau(2))
q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
endif
#endif
if (useGPU) then
! todo: this is used only for copying hmv to device.. it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
! q_dev = q_mat
#if REALCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
endif ! useGPU
do istep = 1, na, nblk
......@@ -330,12 +306,15 @@
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -360,13 +339,12 @@
call timer%start("blas")
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', nstor, l_rows, &
CONST_1_0, hvm, ubound(hvm,dim=1), &
CONST_0_0, tmat, max_stored_rows)
call PRECISION_SYRK('U', 'T', &
#endif
#if COMPLEXCASE == 1