Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
a0934d4e
Commit
a0934d4e
authored
Jan 19, 2016
by
Andreas Marek
Browse files
Merge branch 'master' into ELPA_GPU
parents
98a4db33
33a94bfc
Changes
10
Expand all
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
a0934d4e
...
...
@@ -17,7 +17,12 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/check_for_gpu.F90
\
src/mod_cuda.F90
\
src/interface_c_kernel.F90
\
src/elpa2_compute.F90
\
src/mod_pack_unpack_real.F90
\
src/elpa2_kernels/mod_single_hh_trafo_real.F90
\
src/mod_compute_hh_trafo_real.F90
\
src/mod_compute_hh_trafo_complex.F90
\
src/mod_pack_unpack_complex.F90
\
src/elpa2_compute.F90
\
src/elpa2.F90
\
src/elpa_c_interface.F90
\
src/elpa_qr/qr_utils.f90
\
...
...
@@ -314,6 +319,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i
:
$(top_srcdir)/src/elpa1.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-I
$(top_srcdir)
/
-c
$(top_srcdir)
/src/elpa1.F90
-o
$@
mod_compute_hh_trafo_real.i
:
$(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/src/mod_compute_hh_trafo_real.F90
-o
$@
include
doxygen.am
CLEANFILES
=
\
...
...
src/elpa1.F90
View file @
a0934d4e
...
...
@@ -298,6 +298,8 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nev
,
lda
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
a
(
lda
,
matrixCols
),
ev
(
na
),
q
(
ldq
,
matrixCols
)
! was
! real a(lda,*), q(ldq,*)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
mpierr
real
(
kind
=
rk
),
allocatable
::
e
(:),
tau
(:)
...
...
@@ -397,6 +399,8 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nev
,
lda
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
complex
(
kind
=
ck
)
::
a
(
lda
,
matrixCols
),
q
(
ldq
,
matrixCols
)
! was
! complex a(lda,*), q(ldq,*)
real
(
kind
=
rk
)
::
ev
(
na
)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
...
...
src/elpa1_compute.F90
View file @
a0934d4e
...
...
@@ -138,6 +138,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
a
(
lda
,
matrixCols
),
d
(
na
),
e
(
na
),
tau
(
na
)
! was
! real a(lda,*)
integer
(
kind
=
ik
),
parameter
::
max_stored_rows
=
32
...
...
@@ -479,6 +481,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
nqc
,
lda
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
a
(
lda
,
matrixCols
),
q
(
ldq
,
matrixCols
),
tau
(
na
)
! was
! real a(lda,*), q(ldq,*)
integer
(
kind
=
ik
)
::
max_stored_rows
...
...
@@ -911,6 +915,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
complex
(
kind
=
ck
)
::
a
(
lda
,
matrixCols
),
tau
(
na
)
! was
! complex a(lda,*)
real
(
kind
=
rk
)
::
d
(
na
),
e
(
na
)
integer
(
kind
=
ik
),
parameter
::
max_stored_rows
=
32
...
...
@@ -1278,6 +1284,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
nqc
,
lda
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
complex
(
kind
=
ck
)
::
a
(
lda
,
matrixCols
),
q
(
ldq
,
matrixCols
),
tau
(
na
)
! was
! complex a(lda,*), q(ldq,*)
integer
(
kind
=
ik
)
::
max_stored_rows
...
...
@@ -1678,6 +1686,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
nev
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
d
(
na
),
e
(
na
),
q
(
ldq
,
matrixCols
)
! was
! real q(ldq,*)
integer
(
kind
=
ik
)
::
i
,
j
,
n
,
np
,
nc
,
nev1
,
l_cols
,
l_rows
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
...
...
@@ -1902,6 +1912,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
nev
,
nqoff
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
real
(
kind
=
rk
)
::
d
(
na
),
e
(
na
),
q
(
ldq
,
matrixCols
)
! was
! real q(ldq,*)
integer
(
kind
=
ik
),
parameter
::
min_submatrix_size
=
16
! Minimum size of the submatrices to be used
...
...
@@ -2175,6 +2187,8 @@ module ELPA1_compute
mpi_comm_cols
,
npc_0
,
npc_n
integer
(
kind
=
ik
)
::
l_col
(
na
),
p_col
(
na
),
l_col_out
(
na
),
p_col_out
(
na
)
real
(
kind
=
rk
)
::
d
(
na
),
e
,
q
(
ldq
,
matrixCols
)
! was
! real q(ldq,*)
integer
(
kind
=
ik
),
parameter
::
max_strip
=
128
...
...
@@ -3309,6 +3323,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
a
(
lda
,
matrixCols
)
! was
! real a(lda, *)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_col1
,
l_row1
,
l_colx
,
l_rowx
...
...
@@ -3490,6 +3506,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
rk
)
::
a
(
lda
,
matrixCols
)
! was
! real a(lda,*)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_col1
,
l_row1
,
l_colx
,
l_rowx
...
...
@@ -3626,6 +3644,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
complex
(
kind
=
ck
)
::
a
(
lda
,
matrixCols
)
!was
! complex a(lda,*)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_col1
,
l_row1
,
l_colx
,
l_rowx
...
...
@@ -3803,6 +3823,8 @@ module ELPA1_compute
integer
(
kind
=
ik
)
::
na
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
complex
(
kind
=
ck
)
::
a
(
lda
,
matrixCols
)
! was
! complex a(lda,*)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_col1
,
l_row1
,
l_colx
,
l_rowx
...
...
src/elpa2.F90
View file @
a0934d4e
...
...
@@ -156,6 +156,8 @@ contains
mpi_comm_cols
,
mpi_comm_all
integer
(
kind
=
ik
),
intent
(
in
)
::
nblk
real
(
kind
=
rk
),
intent
(
inout
)
::
a
(
lda
,
matrixCols
),
ev
(
na
),
q
(
ldq
,
matrixCols
)
! was
! real a(lda,*), q(ldq,*)
real
(
kind
=
rk
),
allocatable
::
hh_trans_real
(:,:)
integer
(
kind
=
ik
)
::
my_pe
,
n_pes
,
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
...
...
@@ -429,6 +431,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
integer
(
kind
=
ik
)
::
THIS_COMPLEX_ELPA_KERNEL
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nev
,
lda
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
mpi_comm_all
complex
(
kind
=
ck
),
intent
(
inout
)
::
a
(
lda
,
matrixCols
),
q
(
ldq
,
matrixCols
)
! was
! complex a(lda,*), q(ldq,*)
real
(
kind
=
rk
),
intent
(
inout
)
::
ev
(
na
)
complex
(
kind
=
ck
),
allocatable
::
hh_trans_complex
(:,:)
...
...
src/elpa2_compute.F90
View file @
a0934d4e
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/mod_single_hh_trafo_real.F90
0 → 100644
View file @
a0934d4e
module single_hh_trafo_real
  implicit none
#include "config-f90.h"

  ! Exactly one variant of the routine is compiled, selected by WITH_OPENMP.
#ifdef WITH_OPENMP
  public :: single_hh_trafo_real_cpu_openmp
#else
  public :: single_hh_trafo_real_cpu
#endif

contains

  !> Apply a single real Householder transformation to the first nq rows of q.
  !>
  !> hh(1) is used as the scaling factor (tau); hh(2:nb) are the remaining
  !> components of the Householder vector, whose first component is taken
  !> to be 1 implicitly (column 1 of q is used unscaled).
  !>
  !> q   (inout) matrix of shape (ldq, nb); only rows 1..nq are touched
  !> hh  (in)    tau in hh(1), Householder vector entries in hh(2:nb)
  !> nb  (in)    number of columns of q / length of hh
  !> nq  (in)    number of rows of q that are transformed
  !> ldq (in)    leading dimension of q
  !>
  !> This routine is not performance critical, hence plain Fortran.
#ifdef WITH_OPENMP
  subroutine single_hh_trafo_real_cpu_openmp(q, hh, nb, nq, ldq)
#else
  subroutine single_hh_trafo_real_cpu(q, hh, nb, nq, ldq)
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision

    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq
    real(kind=rk),    intent(inout) :: q(1:ldq, 1:nb)
    real(kind=rk),    intent(in)    :: hh(1:nb)

    integer(kind=ik) :: icol      ! column index into q / entry index into hh
    real(kind=rk)    :: qh(nq)    ! work vector: tau * (q * hh)

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("single_hh_trafo_real_cpu_openmp")
#else
    call timer%start("single_hh_trafo_real_cpu")
#endif
#endif

    ! qh = q * hh   (first vector component is an implicit 1)
    qh = q(1:nq, 1)
    do icol = 2, nb
      qh = qh + q(1:nq, icol) * hh(icol)
    end do

    ! qh = qh * tau
    qh = qh * hh(1)

    ! q = q - qh * hh**T
    q(1:nq, 1) = q(1:nq, 1) - qh
    do icol = 2, nb
      q(1:nq, icol) = q(1:nq, icol) - qh * hh(icol)
    end do

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("single_hh_trafo_real_cpu_openmp")
#else
    call timer%stop("single_hh_trafo_real_cpu")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine single_hh_trafo_real_cpu_openmp
#else
  end subroutine single_hh_trafo_real_cpu
#endif

end module single_hh_trafo_real
src/mod_compute_hh_trafo_complex.F90
0 → 100644
View file @
a0934d4e
module compute_hh_trafo_complex
#include "config-f90.h"
  implicit none

  ! Exactly one variant of the driver is compiled, selected by WITH_OPENMP.
#ifdef WITH_OPENMP
  public compute_hh_trafo_complex_cpu_openmp
#else
  public compute_hh_trafo_complex_cpu
#endif

  include 'mpif.h'

contains

  !> Driver that applies ncols complex Householder (hh) vectors, taken from
  !> columns off+1 .. off+ncols of bcast_buffer, to stripe `istripe` of `a`
  !> by calling the complex kernel that was compiled in / selected through
  !> THIS_COMPLEX_ELPA_KERNEL.
  !>
  !> a                        work array; section a(1, j+off+a_off, istripe[, my_thread])
  !>                          is handed to the kernels
  !> stripe_width, a_dim2,
  !> stripe_count             extents of a (plus max_threads with OpenMP)
  !> a_off                    column offset added when addressing a
  !> nbw, max_blk_size        extents of bcast_buffer; nbw is also passed to the kernels
  !> bcast_buffer             Householder vectors, one per column
  !> kernel_flops             accumulated flop count (updated: + 16*nl*ncols*nbw)
  !> kernel_time              accumulated wall time spent in the kernels (MPI_Wtime)
  !> off, ncols, istripe      column offset, number of vectors, stripe index
  !> last_stripe_width        (non-OpenMP) row count used for the last stripe
  !> my_thread                (OpenMP) thread index; only thread 1 updates the counters
  !> THIS_COMPLEX_ELPA_KERNEL kernel selector, evaluated when
  !>                          WITH_NO_SPECIFIC_COMPLEX_KERNEL is defined
#ifdef WITH_OPENMP
  subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, &
                                                 a_off, nbw, max_blk_size, bcast_buffer,            &
                                                 kernel_flops, kernel_time,                         &
                                                 off, ncols, istripe,                               &
                                                 my_thread, THIS_COMPLEX_ELPA_KERNEL)
#else
  subroutine compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count,                    &
                                          a_off, nbw, max_blk_size, bcast_buffer,                   &
                                          kernel_flops, kernel_time,                                &
                                          off, ncols, istripe, last_stripe_width,                   &
                                          THIS_COMPLEX_ELPA_KERNEL)
#endif
    use precision
    use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
    use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
    use complex_generic_kernel, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    real(kind=rk),     intent(inout) :: kernel_time
    integer(kind=lik), intent(inout) :: kernel_flops
    integer(kind=ik),  intent(in)    :: nbw, max_blk_size
    complex(kind=ck)                 :: bcast_buffer(nbw, max_blk_size)
    integer(kind=ik),  intent(in)    :: a_off
    integer(kind=ik),  intent(in)    :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
    integer(kind=ik),  intent(in)    :: last_stripe_width
    complex(kind=ck)                 :: a(stripe_width, a_dim2, stripe_count)
#else
    integer(kind=ik),  intent(in)    :: max_threads
    complex(kind=ck)                 :: a(stripe_width, a_dim2, stripe_count, max_threads)
#endif
    integer(kind=ik),  intent(in)    :: THIS_COMPLEX_ELPA_KERNEL

    ! Private variables in OMP regions (my_thread) should better be in the argument list!
    integer(kind=ik),  intent(in)    :: off, ncols, istripe
#ifdef WITH_OPENMP
    integer(kind=ik),  intent(in)    :: my_thread
    integer(kind=ik)                 :: noff
#endif
    integer(kind=ik)                 :: j     ! loop index over Householder vectors
    integer(kind=ik)                 :: nl    ! number of rows of the stripe to process
    real(kind=rk)                    :: ttt   ! wall-clock time stamp taken before the kernels run

    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Currently (on Sandy Bridge), single is faster than double
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    complex(kind=ck)                 :: w(nbw, 2)   ! pair of hh vectors for the 2-vector kernel

    ! Open the timing region (was erroneously timer%stop on entry).
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%start("compute_hh_trafo_complex_cpu")
#endif
#endif

    ! Determine how many rows of the stripe have to be processed.
#ifdef WITH_OPENMP
    if (istripe < stripe_count) then
      nl = stripe_width
    else
      ! NOTE(review): thread_width and l_nev are neither dummy arguments nor
      ! locals of this routine; presumably they must come from a used module
      ! or be added to the argument list -- confirm against the caller.
      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
      nl   = min(my_thread*thread_width - noff, l_nev - noff)
      if (nl <= 0) then
        ! Nothing to do for this thread: close the timing region and leave.
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#endif
        return
      endif
    endif
#else
    nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
#endif

    ! Take the time stamp once, before kernel dispatch, so that ttt is
    ! defined even if none of the kernel branches below is executed.
    ttt = mpi_wtime()

#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ! Process the vectors in pairs, from the last one downwards.
      do j = ncols, 2, -2
        w(:,1) = bcast_buffer(1:nbw, j+off)
        w(:,2) = bcast_buffer(1:nbw, j+off-1)
#ifdef WITH_OPENMP
        call double_hh_trafo_complex_sse_avx_2hv(a(1, j+off+a_off-1, istripe, my_thread), &
                                                 w, nbw, nl, stripe_width, nbw)
#else
        call double_hh_trafo_complex_sse_avx_2hv(a(1, j+off+a_off-1, istripe), &
                                                 w, nbw, nl, stripe_width, nbw)
#endif
      enddo
      ! After the loop j == 1 exactly when ncols is odd: one vector is left over.
#ifdef WITH_OPENMP
      if (j == 1) call single_hh_trafo_complex_sse_avx_1hv(a(1, 1+off+a_off, istripe, my_thread), &
                                                           bcast_buffer(1, off+1), nbw, nl, stripe_width)
#else
      if (j == 1) call single_hh_trafo_complex_sse_avx_1hv(a(1, 1+off+a_off, istripe), &
                                                           bcast_buffer(1, off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_generic_simple(a(1, j+off+a_off, istripe, my_thread), &
                                                    bcast_buffer(1, j+off), nbw, nl, stripe_width)
#else
        call single_hh_trafo_complex_generic_simple(a(1, j+off+a_off, istripe), &
                                                    bcast_buffer(1, j+off), nbw, nl, stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */

#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP     .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_generic(a(1, j+off+a_off, istripe, my_thread), &
                                             bcast_buffer(1, j+off), nbw, nl, stripe_width)
#else
        call single_hh_trafo_complex_generic(a(1, j+off+a_off, istripe), &
                                             bcast_buffer(1, j+off), nbw, nl, stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#if defined(WITH_COMPLEX_SSE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex(a(1, j+off+a_off, istripe, my_thread), &
                                     bcast_buffer(1, j+off), nbw, nl, stripe_width)
#else
        call single_hh_trafo_complex(a(1, j+off+a_off, istripe), &
                                     bcast_buffer(1, j+off), nbw, nl, stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_KERNEL */

!#if defined(WITH_AVX_SANDYBRIDGE)
!              call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

!#if defined(WITH_AMD_BULLDOZER)
!              call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_sse_avx_1hv(a(1, j+off+a_off, istripe, my_thread), &
                                                 bcast_buffer(1, j+off), nbw, nl, stripe_width)
#else
        call single_hh_trafo_complex_sse_avx_1hv(a(1, j+off+a_off, istripe), &
                                                 bcast_buffer(1, j+off), nbw, nl, stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */

    ! Book-keeping of flops and kernel time; with OpenMP only thread 1 updates
    ! the shared counters.
#ifdef WITH_OPENMP
    if (my_thread == 1) then
#endif
      kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
      kernel_time  = kernel_time + mpi_wtime() - ttt
#ifdef WITH_OPENMP
    endif
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%stop("compute_hh_trafo_complex_cpu")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine compute_hh_trafo_complex_cpu_openmp
#else
  end subroutine compute_hh_trafo_complex_cpu
#endif

end module compute_hh_trafo_complex
src/mod_compute_hh_trafo_real.F90
0 → 100644
View file @
a0934d4e
module
compute_hh_trafo_real
#include "config-f90.h"
implicit
none
#ifdef WITH_OPENMP
public
compute_hh_trafo_real_cpu_openmp
#else
public
compute_hh_trafo_real_cpu
#endif
include
'mpif.h'
contains
#ifdef WITH_OPENMP
subroutine
compute_hh_trafo_real_cpu_openmp
(
a
,
stripe_width
,
a_dim2
,
stripe_count
,
max_threads
,
&
a_off
,
nbw
,
max_blk_size
,
bcast_buffer
,
kernel_flops
,
kernel_time
,
&
off
,
ncols
,
istripe
,
&
my_thread
,
THIS_REAL_ELPA_KERNEL
)
#else
subroutine
compute_hh_trafo_real_cpu
(
a
,
stripe_width
,
a_dim2
,
stripe_count
,
&
a_off
,
nbw
,
max_blk_size
,
bcast_buffer
,
kernel_flops
,
kernel_time
,
&
off
,
ncols
,
istripe
,
last_stripe_width
,
&
THIS_REAL_ELPA_KERNEL
)
#endif
use
precision
use
elpa2_utilities
use
single_hh_trafo_real
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
use
real_generic_simple_kernel
,
only
:
double_hh_trafo_generic_simple
#endif
!#if defined(WITH_REAL_GENERIC_KERNEL)
! use real_generic_kernel, only : double_hh_trafo_generic
!#endif
#if defined(WITH_REAL_BGP_KERNEL)
use
real_bgp_kernel
,
only
:
double_hh_trafo_bgp
#endif
#if defined(WITH_REAL_BGQ_KERNEL)
use
real_bgq_kernel
,
only
:
double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
use
timings
#endif
implicit
none
include
"mpif.h"
real
(
kind
=
rk
),
intent
(
inout
)
::
kernel_time
integer
(
kind
=
lik
)
::
kernel_flops
integer
(
kind
=
ik
),
intent
(
in
)
::
nbw
,
max_blk_size
real
(
kind
=
rk
)
::
bcast_buffer
(
nbw
,
max_blk_size
)
integer
(
kind
=
ik
),
intent
(
in
)
::
a_off
integer
(
kind
=
ik
),
intent
(
in
)
::
stripe_width
,
a_dim2
,
stripe_count
#ifndef WITH_OPENMP
integer
(
kind
=
ik
),
intent
(
in
)
::
last_stripe_width
real
(
kind
=
rk
)
::
a
(
stripe_width
,
a_dim2
,
stripe_count
)
#else
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
real
(
kind
=
rk
)
::
a
(
stripe_width
,
a_dim2
,
stripe_count
,
max_threads
)
#endif
integer
(
kind
=
ik
),
intent
(
in
)
::
THIS_REAL_ELPA_KERNEL
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer
(
kind
=
ik
)
::
off
,
ncols
,
istripe
#ifdef WITH_OPENMP
integer
(
kind
=
ik
)
::
my_thread
,
noff
#endif
integer
(
kind
=
ik
)
::
j
,
nl
,
jj
,
jjj
real
(
kind
=
rk
)
::
w
(
nbw
,
6
),
ttt
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call
timer
%
start
(
"compute_hh_trafo_real_cpu_openmp"
)
#else
call
timer
%
start
(
"compute_hh_trafo_real_cpu"
)
#endif
#endif
ttt
=
mpi_wtime
()
#ifndef WITH_OPENMP
nl
=
merge
(
stripe_width
,
last_stripe_width
,
istripe
<
stripe_count
)
#else
if
(
istripe
<
stripe_count
)
then
nl
=
stripe_width
else
noff
=
(
my_thread
-1
)
*
thread_width
+
(
istripe
-1
)
*
stripe_width
nl
=
min
(
my_thread
*
thread_width
-
noff
,
l_nev
-
noff
)
if
(
nl
<=
0
)
then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call
timer
%
stop
(
"compute_hh_trafo_real_cpu_openmp"
)
#else
call
timer
%
stop
(
"compute_hh_trafo_real_cpu"
)
#endif
#endif
return
endif
endif
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK2
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_GENERIC
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_GENERIC_SIMPLE
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_SSE
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_BGP
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_BGQ
)
then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
!FORTRAN CODE / X86 INTRINSIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_GENERIC
)
then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_generic
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
&