Commit b6e1b918 authored by Andreas Marek

Merge branch 'master_fix_up' into ELPA_KNL

parents aebf900d 303c02e6
......@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/precision_macros.h \
src/precision_macros_complex.h
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION)
......@@ -356,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_real2_default.F90 \
test/Fortran/test_real2_qr.F90 \
test/Fortran/test_real2_api.F90 \
test/Fortran/test_real2_banded.F90 \
test/Fortran/test_real.F90 \
test/Fortran/test_real_with_c.F90 \
test/Fortran/test_toeplitz.F90 \
......@@ -386,6 +384,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@ \
elpa2_test_real_qr@SUFFIX@ \
elpa2_test_real_api@SUFFIX@ \
elpa2_test_real_banded@SUFFIX@ \
elpa2_test_complex@SUFFIX@ \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
......@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@_SOURCES = test/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@_SOURCES = test/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
......@@ -772,6 +776,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh \
elpa2_test_complex_default@SUFFIX@.sh \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
......@@ -942,18 +947,15 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/precision_macros.h \
src/precision_macros_complex.h \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
......
......@@ -2,9 +2,9 @@
import sys
simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
"tridiag_band_NUMBER_PRECISION",
......@@ -15,10 +15,11 @@ simple_tokens = [
"solve_tridi_PRECISION",
"solve_tridi_col_PRECISION",
"solve_tridi_single_problem_PRECISION",
"solve_evp_NUMBER_2stage_PRECISION",
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
"herm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION",
"unpack_row_NUMBER_cpu_openmp_PRECISION",
......@@ -45,9 +46,19 @@ simple_tokens = [
"global_product_PRECISION",
"add_tmp_PRECISION",
"v_add_s_PRECISION",
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION",
"compute_hh_trafo_NUMBER_gpu_PRECISION",
"launch_my_pack_c_kernel_NUMBER_PRECISION",
"launch_my_unpack_c_kernel_NUMBER_PRECISION",
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS",
"UPCASENUMBER_ELPA_KERNEL_GENERIC",
"DEFAULT_UPCASENUMBER_ELPA_KERNEL",
"UPCASENUMBER_ELPA_KERNEL_NAMES",
"UPCASENUMBER_ELPA_KERNEL_GPU",
]
blas_tokens = [
"PRECISION_GEMV",
"PRECISION_TRMV",
......@@ -57,6 +68,8 @@ blas_tokens = [
"PRECISION_SYRK",
"PRECISION_SYMV",
"PRECISION_SYMM",
"PRECISION_HEMV",
"PRECISION_HER2",
"PRECISION_SYR2",
"PRECISION_SYR2K",
"PRECISION_GEQRF",
......@@ -75,6 +88,7 @@ blas_tokens = [
explicit_tokens_complex = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_COMPLEX_EXPLICIT_PRECISION", "MPI_COMPLEX16", "MPI_COMPLEX8"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
......@@ -82,8 +96,15 @@ explicit_tokens_complex = [
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("CONST_REAL_0_5", "0.5_rk8", "0.5_rk4"),
("CONST_COMPLEX_PAIR_0_0", "(0.0_rk8,0.0_rk8)", "(0.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_1_0", "(1.0_rk8,0.0_rk8)", "(1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_1_0", "(-1.0_rk8,0.0_rk8)", "(-1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_0_5", "(-0.5_rk8,0.0_rk8)", "(-0.5_rk4,0.0_rk4)"),
("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
("CONST_COMPLEX_1_0", "1.0_ck8", "1.0_ck4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
explicit_tokens_real = [
......@@ -95,6 +116,7 @@ explicit_tokens_real = [
("CONST_8_0", "8.0_rk8", "8.0_rk4"),
("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
......@@ -103,7 +125,10 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def print_variant(number, precision, explicit):
for token in simple_tokens:
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
print "#define ", token, token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number)
print "#define ", token + "_STR", "'" + token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number) + "'"
if("NUMBER" in token):
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
for token in explicit:
......@@ -111,28 +136,51 @@ def print_variant(number, precision, explicit):
def print_undefs(number, explicit):
for token in simple_tokens:
print "#undef ", token.replace("NUMBER", number)
print "#undef ", token
print "#undef ", token + "_STR"
if("NUMBER" in token):
print "#undef ", token.replace("NUMBER", number)
for token in blas_tokens:
print "#undef ", token
for token in explicit:
print "#undef ", token[0]
if(sys.argv[1] == "complex"):
print "#ifdef DOUBLE_PRECISION_COMPLEX"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "double", explicit_tokens_complex)
print "#else"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
elif(sys.argv[1] == "real"):
print "#ifdef DOUBLE_PRECISION_REAL"
print_undefs("real", explicit_tokens_real)
print_variant("real", "double", explicit_tokens_real)
print "#else"
print_undefs("real", explicit_tokens_real)
print_variant("real", "single", explicit_tokens_real)
print "#endif"
else:
assert(False)
\ No newline at end of file
print "#ifdef REALCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE real"
print_undefs("real", explicit_tokens_real)
#print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("real", "double", explicit_tokens_real)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("real", "single", explicit_tokens_real)
print "#endif"
print "#endif"
print "#ifdef COMPLEXCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE complex"
#print_undefs("real", explicit_tokens_real)
print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("complex", "double", explicit_tokens_complex)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
print "#endif"
#print "#elif MACROS_TYPE == COMPLEX_DOUBLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "double", explicit_tokens_complex)
#print "#elif MACROS_TYPE == COMPLEX_SINGLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "single", explicit_tokens_complex)
#print "#endif"
......@@ -159,12 +159,14 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
......@@ -173,11 +175,13 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#endif
......@@ -187,11 +191,13 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
......@@ -200,11 +206,13 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
......@@ -213,6 +221,9 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
! TODO: check whether the following #undef is still needed (remove?)
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
......@@ -220,6 +231,7 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
......@@ -228,11 +240,15 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
! TODO: check whether the following #undef is still needed (remove?)
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
......@@ -245,10 +261,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck8
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
! TODO: check whether the following #undef is still needed (remove?)
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
......@@ -263,11 +283,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
! TODO: check whether the following #undef is still needed (remove?)
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
......
......@@ -52,13 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
#if REALCASE == 1
#include "precision_macros.h"
#endif
#if COMPLEXCASE == 1
#include "precision_macros_complex.h"
#endif
#if REALCASE == 1
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -167,8 +167,12 @@ module ELPA2_compute
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_real_template.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
......@@ -179,8 +183,12 @@ module ELPA2_compute
#undef DOUBLE_PRECISION_REAL
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_real_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
......@@ -192,8 +200,12 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define COMPLEX_DATATYPE ck8
#include "precision_macros_complex.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_complex_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
......@@ -207,8 +219,12 @@ module ELPA2_compute
#define REAL_DATATYPE rk4
#define COMPLEX_DATATYPE ck4
#include "precision_macros_complex.h"
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_complex_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
......
......@@ -61,35 +61,28 @@
! distributed along with the original code in the file "COPYING".
#endif
#include "elpa2_bandred_complex_template.X90"
#define COMPLEXCASE 1
#undef REALCASE
#include "elpa2_bandred_template.X90"
#undef COMPLEXCASE
#include "elpa2_herm_matrix_allreduce_complex_template.X90"
#include "elpa2_trans_ev_band_to_full_complex_template.X90"
#include "elpa2_tridiag_band_complex_template.X90"
#define COMPLEXCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef COMPLEXCASE
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_dot_products_complex_gpu_double(nbw, n)
#else
subroutine compute_hh_dot_products_complex_gpu_single(nbw, n)
#endif
subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n)
#else
call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n)
#endif
call launch_compute_hh_dotp_c_kernel_complex_PRECISION( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine pack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
......@@ -99,28 +92,18 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
call launch_my_pack_c_kernel_complex_PRECISION(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, &
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#else
call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine unpack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
......@@ -131,31 +114,17 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , &
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_PRECISION_complex , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#else
call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
call launch_my_unpack_c_kernel_complex_PRECISION( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force)
#else
subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force)
#endif
subroutine unpack_and_prepare_row_group_complex_gpu_PRECISION(next_unpack_idx, force)
use precision
implicit none
......@@ -168,11 +137,7 @@
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#else
call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#endif
call unpack_row_group_complex_gpu_PRECISION(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
......@@ -184,11 +149,7 @@
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#else
subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#endif
subroutine compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
......@@ -205,27 +166,15 @@
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype
dev_offset_2 =( off-1 )*size_of_double_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_double(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#else
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype
dev_offset_2 =( off-1 )*size_of_single_complex_datatype
size_of_PRECISION_complex
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_PRECISION_complex
dev_offset_2 =( off-1 )*size_of_PRECISION_complex
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_single(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
call launch_compute_hh_trafo_c_kernel_complex_PRECISION(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
......
......@@ -60,11 +60,15 @@
! distributed along with the original code in the file "COPYING".
#endif
#include "elpa2_bandred_real_template.X90"
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa2_bandred_template.X90"
#undef REALCASE
#include "elpa2_symm_matrix_allreduce_real_template.X90"
#include "elpa2_trans_ev_band_to_full_real_template.X90"
#include "elpa2_tridiag_band_real_template.X90"
#define REALCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef REALCASE
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"
......
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine herm_matrix_allreduce_double(n,a,lda,ldb,comm)
#else
subroutine herm_matrix_allreduce_single(n,a,lda,ldb,comm)
#endif
subroutine herm_matrix_allreduce_PRECISION(n,a,lda,ldb,comm)
!-------------------------------------------------------------------------------
! herm_matrix_allreduce: Does an mpi_allreduce for a hermitian matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
......@@ -20,13 +18,7 @@
integer(kind=ik) :: i, nc, mpierr
complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("herm_matrix_allreduce_double")
#else
call timer%start("herm_matrix_allreduce_single")
#endif
#endif
call timer%start("herm_matrix_allreduce" // PRECISION_SUFFIX)
nc = 0
do i=1,n
......@@ -34,18 +26,9 @@
nc = nc+i