Commit 50d4667f authored by Andreas Marek

Merge branch 'ELPA_GPU' into ELPA_KNL

parents 00a3c9e3 6b5ffc79
This diff is collapsed.
......@@ -50,6 +50,9 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_real_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -809,6 +812,9 @@ EXTRA_DIST = \
src/elpa1_compute_complex_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
......
This diff is collapsed.
......@@ -67,234 +67,25 @@ module complex_generic_simple_kernel
#endif
contains
! the intel compiler creates a temp copy of array q
! this should be avoided without using assumed size arrays
! Apply one Householder transformation to rows 1..nq of q
! (complex data of kind ck8).
!
! hh(1) is read as the scalar factor tau, hh(2:nb) as the trailing entries
! of the Householder vector. Column 1 of q enters the projection unscaled,
! i.e. the leading vector entry is treated as one.
!
! Arguments:
!   q   (inout) columns 1..nb of rows 1..nq are updated in place
!   hh  (in)    tau in hh(1), vector entries in hh(2:nb)
!   nb  (in)    number of columns of q that are transformed
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
subroutine single_hh_trafo_complex_generic_simple_double(q, hh, nb, nq, ldq)
  use precision
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
  integer(kind=ik), intent(in)     :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
  complex(kind=ck8), intent(inout) :: q(ldq,*)
  complex(kind=ck8), intent(in)    :: hh(*)
#else
  complex(kind=ck8), intent(inout) :: q(1:ldq,1:nb)
  complex(kind=ck8), intent(in)    :: hh(1:nb)
#endif
  integer(kind=ik)                 :: i
  complex(kind=ck8)                :: tau1, x(nq)
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple_double")
#endif
  ! Just one Householder transformation

  ! Projection of every row of q onto the Householder vector:
  ! x = q(:,1) + sum over i=2..nb of q(:,i)*conjg(hh(i))
  x(1:nq) = q(1:nq,1)
  do i=2,nb
    x(1:nq) = x(1:nq) + q(1:nq,i)*conjg(hh(i))
  enddo

  ! Scale the projection by -tau
  tau1 = hh(1)
  x(1:nq) = x(1:nq)*(-tau1)

  ! Rank-one update of q; column 1 pairs with the implicit leading one
  q(1:nq,1) = q(1:nq,1) + x(1:nq)
  do i=2,nb
    q(1:nq,i) = q(1:nq,i) + x(1:nq)*hh(i)
  enddo
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple_double")
#endif
end subroutine single_hh_trafo_complex_generic_simple_double
! --------------------------------------------------------------------------------------------------
! Apply two Householder transformations in one sweep to rows 1..nq of q
! (complex data of kind ck8).
!
! hh(1,1) and hh(1,2) are read as the scalar factors of the two
! transformations. For column i of q the code pairs hh(i-1,1) with hh(i,2),
! so the first vector acts on columns 2..nb+1 and the second on columns
! 1..nb; the leading entry of each vector is treated as one.
!
! Arguments:
!   q   (inout) columns 1..nb+1 of rows 1..nq are updated in place
!   hh  (in)    column 1 and column 2 hold the two Householder vectors
!   nb  (in)    length of each Householder vector
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
!   ldh (in)    leading dimension of hh
subroutine double_hh_trafo_complex_generic_simple_double(q, hh, nb, nq, ldq, ldh)
  use precision
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
  integer(kind=ik), intent(in)     :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
  complex(kind=ck8), intent(inout) :: q(ldq,*)
  complex(kind=ck8), intent(in)    :: hh(ldh,*)
#else
  complex(kind=ck8), intent(inout) :: q(1:ldq,1:nb+1)
  complex(kind=ck8), intent(in)    :: hh(1:ldh,1:2)
#endif
  integer(kind=ik)                 :: col
  complex(kind=ck8)                :: overlap, scale_x, scale_y, cross
  complex(kind=ck8)                :: vx(nq), vy(nq)
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("kernel complex generic simple: double_hh_trafo_complex_generic_simple_double")
#endif

  ! Dot product of the two Householder vectors; the factor 1 stands in
  ! for the implicit leading entry of the first vector
  overlap = conjg(hh(2,2))*1
  do col = 3, nb
    overlap = overlap + (conjg(hh(col,2))*hh(col-1,1))
  end do

  ! Projections of the rows of q onto the first (vx) and second (vy) vector
  vx(1:nq) = q(1:nq,2)
  vy(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2))
  do col = 3, nb
    vx(1:nq) = vx(1:nq) + q(1:nq,col)*conjg(hh(col-1,1))
    vy(1:nq) = vy(1:nq) + q(1:nq,col)*conjg(hh(col,2))
  end do
  vx(1:nq) = vx(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1))

  ! Scale by the negated tau factors; vy additionally picks up the
  ! coupling term through the dot product of the two vectors
  scale_x = -hh(1,1)
  scale_y = -hh(1,2)
  cross   = -hh(1,2)*overlap
  vx(1:nq) = vx(1:nq)*scale_x
  vy(1:nq) = vy(1:nq)*scale_y + vx(1:nq)*cross

  ! Write the combined update back into q
  q(1:nq,1) = q(1:nq,1) + vy(1:nq)
  q(1:nq,2) = q(1:nq,2) + vx(1:nq) + vy(1:nq)*hh(2,2)
  do col = 3, nb
    q(1:nq,col) = q(1:nq,col) + vx(1:nq)*hh(col-1,1) + vy(1:nq)*hh(col,2)
  end do
  q(1:nq,nb+1) = q(1:nq,nb+1) + vx(1:nq)*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple_double")
#endif
end subroutine double_hh_trafo_complex_generic_simple_double
! Pull in the shared kernel template with the macro settings below:
! complex case, DOUBLE_PRECISION_COMPLEX set, DATATYPE = ck8.
! NOTE(review): the template body is not visible here; presumably it
! provides the double-precision complex kernels - confirm.
#define COMPLEXCASE 1
#define DOUBLE_PRECISION_COMPLEX 1
#define DATATYPE ck8
#include "elpa2_kernels_simple_template.X90"
! Remove the macros again so they do not leak into the code that follows
#undef DOUBLE_PRECISION_COMPLEX
#undef DATATYPE
#undef COMPLEXCASE
#ifdef WANT_SINGLE_PRECISION_COMPLEX
! SINGLE PRECISION IMPLEMENTATION, AT THE MOMENT DUPLICATED !!!
! Apply one Householder transformation to rows 1..nq of q
! (complex data of kind ck4).
!
! hh(1) is read as the scalar factor tau, hh(2:nb) as the trailing entries
! of the Householder vector. Column 1 of q enters the projection unscaled,
! i.e. the leading vector entry is treated as one.
!
! Arguments:
!   q   (inout) columns 1..nb of rows 1..nq are updated in place
!   hh  (in)    tau in hh(1), vector entries in hh(2:nb)
!   nb  (in)    number of columns of q that are transformed
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
subroutine single_hh_trafo_complex_generic_simple_single(q, hh, nb, nq, ldq)
  use precision
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
  integer(kind=ik), intent(in)     :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
  ! q must have the same kind (ck4) in both preprocessor branches; a ck8
  ! declaration here would mismatch the single-precision actual argument
  complex(kind=ck4), intent(inout) :: q(ldq,*)
  complex(kind=ck4), intent(in)    :: hh(*)
#else
  complex(kind=ck4), intent(inout) :: q(1:ldq,1:nb)
  complex(kind=ck4), intent(in)    :: hh(1:nb)
#endif
  integer(kind=ik)                 :: i
  complex(kind=ck4)                :: tau1, x(nq)
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple_single")
#endif
  ! Just one Householder transformation

  ! Projection of every row of q onto the Householder vector:
  ! x = q(:,1) + sum over i=2..nb of q(:,i)*conjg(hh(i))
  x(1:nq) = q(1:nq,1)
  do i=2,nb
    x(1:nq) = x(1:nq) + q(1:nq,i)*conjg(hh(i))
  enddo

  ! Scale the projection by -tau
  tau1 = hh(1)
  x(1:nq) = x(1:nq)*(-tau1)

  ! Rank-one update of q; column 1 pairs with the implicit leading one
  q(1:nq,1) = q(1:nq,1) + x(1:nq)
  do i=2,nb
    q(1:nq,i) = q(1:nq,i) + x(1:nq)*hh(i)
  enddo
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple_single")
#endif
end subroutine single_hh_trafo_complex_generic_simple_single
! --------------------------------------------------------------------------------------------------
! Apply two Householder transformations in one sweep to rows 1..nq of q
! (complex data of kind ck4).
!
! hh(1,1) and hh(1,2) are read as the scalar factors tau1 and tau2. For
! column i of q the code pairs hh(i-1,1) with hh(i,2), so the first vector
! acts on columns 2..nb+1 and the second on columns 1..nb.
!
! Arguments:
!   q   (inout) columns 1..nb+1 of rows 1..nq are updated in place
!   hh  (in)    column 1 and column 2 hold the two Householder vectors
!   nb  (in)    length of each Householder vector
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
!   ldh (in)    leading dimension of hh
subroutine double_hh_trafo_complex_generic_simple_single(q, hh, nb, nq, ldq, ldh)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck4), intent(inout) :: q(ldq,*)
complex(kind=ck4), intent(in) :: hh(ldh,*)
#else
complex(kind=ck4), intent(inout) :: q(1:ldq,1:nb+1)
complex(kind=ck4), intent(in) :: hh(1:ldh,1:2)
#endif
! x and y are automatic work arrays of length nq holding the projections
! onto the first and second Householder vector
complex(kind=ck4) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel complex generic simple: double_hh_trafo_complex_generic_simple_single")
#endif
! Calculate dot product of the two Householder vectors
! (the factor 1 stands in for the implicit leading entry of the first vector)
s = conjg(hh(2,2))*1
do i=3,nb
s = s+(conjg(hh(i,2))*hh(i-1,1))
enddo
! Do the Householder transformations
! First accumulate the projections: x uses vector 1, y uses vector 2
x(1:nq) = q(1:nq,2)
y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2))
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x(1:nq) = x(1:nq) + q(1:nq,i)*h1
y(1:nq) = y(1:nq) + q(1:nq,i)*h2
enddo
! Column nb+1 contributes to x only (last entry of vector 1)
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
! Scale x by -tau1; y is scaled by -tau2 and coupled to x through s
h1 = -tau1
x(1:nq) = x(1:nq)*h1
h1 = -tau2
h2 = -tau2*s
y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2
! Write the combined update back into q
q(1:nq,1) = q(1:nq,1) + y(1:nq)
q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2)
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2
enddo
q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple_single")
! NOTE(review): the macro and include lines up to the next #endif sit
! inside the timing guard of this routine. They look like lines of the
! template-based replacement that the diff view interleaved with the
! removed routine - confirm against the repository before relying on them.
#define COMPLEXCASE 1
#undef DOUBLE_PRECISION_COMPLEX
#define DATATYPE ck4
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_COMPLEX
#undef DATATYPE
#undef COMPLEXCASE
#endif
end subroutine double_hh_trafo_complex_generic_simple_single
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
end module complex_generic_simple_kernel
! --------------------------------------------------------------------------------------------------
This diff is collapsed.
This diff is collapsed.
......@@ -68,152 +68,24 @@ module real_generic_simple_kernel
#endif
contains
! the intel compiler creates a temporary copy of array q
! This should be prevented if possible without using assumed size arrays
! Apply two Householder transformations in one sweep to rows 1..nq of q
! (real data of kind rk8).
!
! hh(1,1) and hh(1,2) are read as the scalar factors of the two
! transformations. For column i of q the code pairs hh(i-1,1) with hh(i,2),
! so the first vector acts on columns 2..nb+1 and the second on columns
! 1..nb; the leading entry of each vector is treated as one.
!
! Arguments:
!   q   (inout) columns 1..nb+1 of rows 1..nq are updated in place
!   hh  (in)    column 1 and column 2 hold the two Householder vectors
!   nb  (in)    length of each Householder vector
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
!   ldh (in)    leading dimension of hh
subroutine double_hh_trafo_generic_simple_double(q, hh, nb, nq, ldq, ldh)
  use precision
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
  integer(kind=ik), intent(in)  :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
  real(kind=rk8), intent(inout) :: q(ldq,*)
  real(kind=rk8), intent(in)    :: hh(ldh,*)
#else
  real(kind=rk8), intent(inout) :: q(ldq,1:nb+1)
  real(kind=rk8), intent(in)    :: hh(ldh,2)
#endif
  integer(kind=ik)              :: col
  real(kind=rk8)                :: overlap, scale_x, scale_y, cross
  real(kind=rk8)                :: vx(nq), vy(nq)
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("kernel generic simple: double_hh_trafo_generic_simple_double")
#endif

  ! Dot product of the two Householder vectors; the factor 1 stands in
  ! for the implicit leading entry of the first vector
  overlap = hh(2,2)*1
  do col = 3, nb
    overlap = overlap + hh(col,2)*hh(col-1,1)
  end do

  ! Projections of the rows of q onto the first (vx) and second (vy) vector
  vx(1:nq) = q(1:nq,2)
  vy(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2)
  do col = 3, nb
    vx(1:nq) = vx(1:nq) + q(1:nq,col)*hh(col-1,1)
    vy(1:nq) = vy(1:nq) + q(1:nq,col)*hh(col,2)
  end do
  vx(1:nq) = vx(1:nq) + q(1:nq,nb+1)*hh(nb,1)

  ! Scale by the negated tau factors; vy additionally picks up the
  ! coupling term through the dot product of the two vectors
  scale_x = -hh(1,1)
  scale_y = -hh(1,2)
  cross   = -hh(1,2)*overlap
  vx(1:nq) = vx(1:nq)*scale_x
  vy(1:nq) = vy(1:nq)*scale_y + vx(1:nq)*cross

  ! Write the combined update back into q
  q(1:nq,1) = q(1:nq,1) + vy(1:nq)
  q(1:nq,2) = q(1:nq,2) + vx(1:nq) + vy(1:nq)*hh(2,2)
  do col = 3, nb
    q(1:nq,col) = q(1:nq,col) + vx(1:nq)*hh(col-1,1) + vy(1:nq)*hh(col,2)
  end do
  q(1:nq,nb+1) = q(1:nq,nb+1) + vx(1:nq)*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("kernel generic simple: double_hh_trafo_generic_simple_double")
#endif
end subroutine double_hh_trafo_generic_simple_double
! Pull in the shared kernel template with the macro settings below:
! real case, DOUBLE_PRECISION_REAL set, DATATYPE = rk8.
! NOTE(review): the template body is not visible here; presumably it
! provides the double-precision real kernels - confirm.
#define REALCASE 1
#define DOUBLE_PRECISION_REAL 1
#define DATATYPE rk8
#include "elpa2_kernels_simple_template.X90"
! Remove the macros again so they do not leak into the code that follows
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef REALCASE
#ifdef WANT_SINGLE_PRECISION_REAL
! single precision implementation at the moment duplicated !!!
! Apply two Householder transformations in one sweep to rows 1..nq of q
! (real data of kind rk4).
!
! hh(1,1) and hh(1,2) are read as the scalar factors tau1 and tau2. For
! column i of q the code pairs hh(i-1,1) with hh(i,2), so the first vector
! acts on columns 2..nb+1 and the second on columns 1..nb.
!
! Arguments:
!   q   (inout) columns 1..nb+1 of rows 1..nq are updated in place
!   hh  (in)    column 1 and column 2 hold the two Householder vectors
!   nb  (in)    length of each Householder vector
!   nq  (in)    number of rows of q that are transformed
!   ldq (in)    leading dimension of q
!   ldh (in)    leading dimension of hh
subroutine double_hh_trafo_generic_simple_single(q, hh, nb, nq, ldq, ldh)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
! NOTE(review): the macro and include lines up to the next #endif sit
! inside the timing guard of this routine. They look like lines of the
! template-based replacement that the diff view interleaved with the
! removed routine - confirm against the repository before relying on them.
#define REALCASE 1
#undef DOUBLE_PRECISION_REAL
#define DATATYPE rk4
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef REALCASE
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk4), intent(inout) :: q(ldq,*)
real(kind=rk4), intent(in) :: hh(ldh,*)
#else
real(kind=rk4), intent(inout) :: q(ldq,1:nb+1)
real(kind=rk4), intent(in) :: hh(ldh,2)
#endif
! x and y are automatic work arrays of length nq holding the projections
! onto the first and second Householder vector
real(kind=rk4) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic simple: double_hh_trafo_generic_simple_single")
#endif
! Calculate dot product of the two Householder vectors
! (the factor 1 stands in for the implicit leading entry of the first vector)
s = hh(2,2)*1
do i=3,nb
s = s+hh(i,2)*hh(i-1,1)
enddo
! Do the Householder transformations
! First accumulate the projections: x uses vector 1, y uses vector 2
x(1:nq) = q(1:nq,2)
y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2)
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
x(1:nq) = x(1:nq) + q(1:nq,i)*h1
y(1:nq) = y(1:nq) + q(1:nq,i)*h2
enddo
! Column nb+1 contributes to x only (last entry of vector 1)
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1)
tau1 = hh(1,1)
tau2 = hh(1,2)
! Scale x by -tau1; y is scaled by -tau2 and coupled to x through s
h1 = -tau1
x(1:nq) = x(1:nq)*h1
h1 = -tau2
h2 = -tau2*s
y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2
! Write the combined update back into q
q(1:nq,1) = q(1:nq,1) + y(1:nq)
q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2)
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2
enddo
q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic simple: double_hh_trafo_generic_simple_single")
#endif
end subroutine double_hh_trafo_generic_simple_single
#endif /* WANT_SINGLE_PRECISION_REAL */
end module real_generic_simple_kernel
! --------------------------------------------------------------------------------------------------
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment