Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
a97b1bf5
Commit
a97b1bf5
authored
Jul 11, 2012
by
Alexander Heinecke
Browse files
added SSE,AVX,FMA4 tri to band real kernels for x86 based systems
parent
098c4139
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
ELPA_development_version/src/elpa2.f90
View file @
a97b1bf5
...
...
@@ -1527,6 +1527,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
a_dim2
=
max_blk_size
+
nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
allocate
(
a
(
stripe_width
,
a_dim2
,
stripe_count
,
max_threads
))
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
...
...
@@ -2014,8 +2015,8 @@ contains
! Private variables in OMP regions (my_thread) are better passed in the argument list!
integer
,
intent
(
in
)
::
off
,
ncols
,
istripe
,
my_thread
integer
j
,
nl
,
noff
real
*
8
w
(
nbw
,
2
),
ttt
integer
j
,
jj
,
jjj
,
nl
,
noff
real
*
8
w
(
nbw
,
6
),
ttt
ttt
=
mpi_wtime
()
if
(
istripe
<
stripe_count
)
then
...
...
@@ -2025,12 +2026,86 @@ contains
nl
=
min
(
my_thread
*
thread_width
-
noff
,
l_nev
-
noff
)
if
(
nl
<=
0
)
return
endif
!FORTRAN CODE
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
call
double_hh_trafo
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
enddo
if
(
j
==
1
)
call
single_hh_trafo
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
!INTRINSIC CODE, USING 2 HOUSEHOLDER VECTORS
!do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
!do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!do j = ncols, 6, -6
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! if (mod(nl,8) == 0) then
! call double_hh_trafo_6hv_fast(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
if
(
my_thread
==
1
)
then
kernel_flops
=
kernel_flops
+
4
*
int
(
nl
,
8
)
*
int
(
ncols
,
8
)
*
int
(
nbw
,
8
)
kernel_time
=
kernel_time
+
mpi_wtime
()
-
ttt
...
...
ELPA_development_version/src/elpa2_tum_kernels_sse-avx_2hv.c
0 → 100644
View file @
a97b1bf5
This diff is collapsed.
Click to expand it.
ELPA_development_version/src/elpa2_tum_kernels_sse-avx_4hv.c
0 → 100644
View file @
a97b1bf5
This diff is collapsed.
Click to expand it.
ELPA_development_version/src/elpa2_tum_kernels_sse-avx_6hv.c
0 → 100644
View file @
a97b1bf5
This diff is collapsed.
Click to expand it.
ELPA_development_version/test/Makefile
View file @
a97b1bf5
# ------------------------------------------------------------------------------
# Please set the variables below according to your system!
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux):
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with AVX for Sandy Bridge:
#
#F90=mpiifort -O3 -traceback -g -fpe0
#CC=mpiicc -O3
#F90OPT=$(F90) -mavx
#CCOPT=$(CC) -mavx
#MKL_HOME=/opt/intel/mkl/lib/intel64
#LIBS = -mkl -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with SSE3:
#
F90
=
mpiifort
-O3
-traceback
-g
-fpe0
CC
=
mpiicc
-O3
F90OPT
=
$(F90)
-msse3
CCOPT
=
$(CC)
-msse3
MKL_HOME
=
/opt/intel/mkl/lib/intel64
LIBS
=
-mkl
-L
$(MKL_HOME)
-lmkl_scalapack_lp64
-lmkl_blacs_intelmpi_lp64
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) and GCC 4.6 with FMA4 for AMD Bulldozer:
#
#F90=mpiifort -O3 -traceback -g -fpe0
#CC=gcc -O3
#F90OPT=$(F90) -msse3
#CCOPT=$(CC) -mfma4 -mxop -march=bdver1 -D__USE_AVX128__
#LIBS = -L/opt/acml5.0.0/gfortran64_fma4/lib/ -lacml -lgfortran libscalapack.a
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux) old 11.x Toolchain, do not use:
#
#F90=mpif90 -O3 -traceback -g -fpe0
#F90OPT=$(F90) -xSSE4.2
...
...
@@ -25,7 +54,7 @@
#LIBS = -L/usr/local/lib -lscalapack -llapack-essl -lessl -lblacsF77init -lblacs -lblacsF77init -lblacs -lc
#
# ------------------------------------------------------------------------------
# Settings for IBM BlueGene
/P
# Settings for IBM
AIX
BlueGene
#
F90
=
mpixlf95_r
-O3
-g
-qarch
=
auto
-qtune
=
auto
F90OPT
=
mpixlf95_r
-O4
-g
-qarch
=
auto
-qtune
=
auto
...
...
@@ -54,8 +83,14 @@ read_real_gen: read_real_gen.o elpa1.o
test_complex_gen
:
test_complex_gen.o read_test_parameters.o elpa1.o
$(F90)
-o
$@
test_complex_gen.o read_test_parameters.o elpa1.o
$(LIBS)
#test_real2: test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o
# $(F90) -o $@ test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o $(LIBS)
#test_complex2: test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o
# $(F90) -o $@ test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o $(LIBS)
test_real2
:
test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(F90)
-o
$@
test_real2.o read_test_parameters.o
elpa1.o elpa2.o
elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(LIBS)
$(F90)
-o
$@
test_real2.o
elpa1.o elpa2.o
read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(LIBS)
test_complex2
:
test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(F90)
-o
$@
test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(LIBS)
...
...
@@ -107,6 +142,15 @@ elpa2.o: ../src/elpa2.f90 elpa1.o elpa_pdgeqrf.o
elpa2_kernels.o
:
../src/elpa2_kernels.f90
$(F90OPT)
-c
../src/elpa2_kernels.f90
elpa2_tum_kernels_sse-avx_2hv.o
:
../src/elpa2_tum_kernels_sse-avx_2hv.c
$(CCOPT)
-c
../src/elpa2_tum_kernels_sse-avx_2hv.c
elpa2_tum_kernels_sse-avx_4hv.o
:
../src/elpa2_tum_kernels_sse-avx_4hv.c
$(CCOPT)
-c
../src/elpa2_tum_kernels_sse-avx_4hv.c
elpa2_tum_kernels_sse-avx_6hv.o
:
../src/elpa2_tum_kernels_sse-avx_6hv.c
$(CCOPT)
-c
../src/elpa2_tum_kernels_sse-avx_6hv.c
clean
:
rm
-f
*
.o
*
.mod test_real test_complex test_real_gen test_complex_gen test_real2 test_complex2 read_real read_real_gen read_test_parameters.o
rm
-f
*
.o
*
.mod test_real test_complex test_real_gen test_complex_gen test_real2 test_complex2 read_real read_real_gen read_test_parameters.o
ELPA_development_version/test/test_real2.f90
View file @
a97b1bf5
...
...
@@ -26,7 +26,8 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer
::
na
=
4000
,
nev
=
1500
,
nblk
=
16
integer
,
parameter
::
nblk
=
16
integer
na
,
nev
!-------------------------------------------------------------------------------
! Local Variables
...
...
@@ -43,6 +44,21 @@ program test_real2
integer
::
iseed
(
4096
)
! Random seed, size should be sufficient for every generator
!-------------------------------------------------------------------------------
! Parse command line arguments, if given
character
*
16
arg1
character
*
16
arg2
na
=
12059
nev
=
3401
if
(
iargc
()
==
2
)
then
call
getarg
(
1
,
arg1
)
call
getarg
(
2
,
arg2
)
read
(
arg1
,
*
)
na
read
(
arg2
,
*
)
nev
endif
!-------------------------------------------------------------------------------
! MPI Initialization
...
...
@@ -50,14 +66,6 @@ program test_real2
call
mpi_comm_rank
(
mpi_comm_world
,
myid
,
mpierr
)
call
mpi_comm_size
(
mpi_comm_world
,
nprocs
,
mpierr
)
!-------------------------------------------------------------------------------
! Reading of test parameters (matrix size, number of requested eigenvalue/eigenvector
! pairs, block size) from a file 'test_parameters.in', if that file exists.
! We only read on mpi task number myid = 0 to avoid any possible confusion.
! The parameters of interest are subsequently broadcast to all other mpi tasks.
call
read_test_parameters
(
na
,
nev
,
nblk
,
myid
,
mpi_comm_world
)
!-------------------------------------------------------------------------------
! Selection of number of processor rows/columns
! We try to set up the grid square-like, i.e. start the search for possible
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment