Commit a97b1bf5 authored by Alexander Heinecke
Browse files

Added SSE, AVX, and FMA4 tridiagonal-to-band real kernels for x86-based systems

parent 098c4139
......@@ -1527,6 +1527,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
allocate(a(stripe_width,a_dim2,stripe_count,max_threads))
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
......@@ -2014,8 +2015,8 @@ contains
! Private variables in OMP regions (my_thread) should preferably be passed in the argument list!
integer, intent(in) :: off, ncols, istripe, my_thread
integer j, nl, noff
real*8 w(nbw,2), ttt
integer j, jj, jjj, nl, noff
real*8 w(nbw,6), ttt
ttt = mpi_wtime()
if(istripe<stripe_count) then
......@@ -2025,12 +2026,86 @@ contains
nl = min(my_thread*thread_width-noff, l_nev-noff)
if(nl<=0) return
endif
!FORTRAN CODE
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 2 HOUSEHOLDER VECTORS
!do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
!do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!do j = ncols, 6, -6
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! if (mod(nl,8) == 0) then
! call double_hh_trafo_6hv_fast(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
if(my_thread==1) then
kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# ------------------------------------------------------------------------------
# Please set the variables below according to your system!
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux):
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with AVX for Sandy Bridge:
#
#F90=mpiifort -O3 -traceback -g -fpe0
#CC=mpiicc -O3
#F90OPT=$(F90) -mavx
#CCOPT=$(CC) -mavx
#MKL_HOME=/opt/intel/mkl/lib/intel64
#LIBS = -mkl -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with SSE3:
#
F90=mpiifort -O3 -traceback -g -fpe0
CC=mpiicc -O3
F90OPT=$(F90) -msse3
CCOPT=$(CC) -msse3
MKL_HOME=/opt/intel/mkl/lib/intel64
LIBS = -mkl -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) and GCC 4.6 with FMA4 for AMD Bulldozer:
#
#F90=mpiifort -O3 -traceback -g -fpe0
#CC=gcc -O3
#F90OPT=$(F90) -msse3
#CCOPT=$(CC) -mfma4 -mxop -march=bdver1 -D__USE_AVX128__
#LIBS = -L/opt/acml5.0.0/gfortran64_fma4/lib/ -lacml -lgfortran libscalapack.a
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux) old 11.x Toolchain, do not use:
#
#F90=mpif90 -O3 -traceback -g -fpe0
#F90OPT=$(F90) -xSSE4.2
......@@ -25,7 +54,7 @@
#LIBS = -L/usr/local/lib -lscalapack -llapack-essl -lessl -lblacsF77init -lblacs -lblacsF77init -lblacs -lc
#
# ------------------------------------------------------------------------------
# Settings for IBM BlueGene/P
# Settings for IBM AIX BlueGene
#
F90 = mpixlf95_r -O3 -g -qarch=auto -qtune=auto
F90OPT = mpixlf95_r -O4 -g -qarch=auto -qtune=auto
......@@ -54,8 +83,14 @@ read_real_gen: read_real_gen.o elpa1.o
test_complex_gen: test_complex_gen.o read_test_parameters.o elpa1.o
$(F90) -o $@ test_complex_gen.o read_test_parameters.o elpa1.o $(LIBS)
#test_real2: test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o
# $(F90) -o $@ test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o $(LIBS)
#test_complex2: test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o
# $(F90) -o $@ test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o elpa2_tum_kernels_sse-avx_2hv.o elpa2_tum_kernels_sse-avx_4hv.o elpa2_tum_kernels_sse-avx_6hv.o $(LIBS)
test_real2: test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(F90) -o $@ test_real2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o $(LIBS)
$(F90) -o $@ test_real2.o elpa1.o elpa2.o read_test_parameters.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o $(LIBS)
test_complex2: test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o
$(F90) -o $@ test_complex2.o read_test_parameters.o elpa1.o elpa2.o elpa2_kernels.o elpa_pdgeqrf.o elpa_pdlarfb.o elpa_qrkernels.o tum_utils.o $(LIBS)
......@@ -107,6 +142,15 @@ elpa2.o: ../src/elpa2.f90 elpa1.o elpa_pdgeqrf.o
elpa2_kernels.o: ../src/elpa2_kernels.f90
$(F90OPT) -c ../src/elpa2_kernels.f90
elpa2_tum_kernels_sse-avx_2hv.o: ../src/elpa2_tum_kernels_sse-avx_2hv.c
$(CCOPT) -c ../src/elpa2_tum_kernels_sse-avx_2hv.c
elpa2_tum_kernels_sse-avx_4hv.o: ../src/elpa2_tum_kernels_sse-avx_4hv.c
$(CCOPT) -c ../src/elpa2_tum_kernels_sse-avx_4hv.c
elpa2_tum_kernels_sse-avx_6hv.o: ../src/elpa2_tum_kernels_sse-avx_6hv.c
$(CCOPT) -c ../src/elpa2_tum_kernels_sse-avx_6hv.c
clean:
rm -f *.o *.mod test_real test_complex test_real_gen test_complex_gen test_real2 test_complex2 read_real read_real_gen read_test_parameters.o
rm -f *.o *.mod test_real test_complex test_real_gen test_complex_gen test_real2 test_complex2 read_real read_real_gen read_test_parameters.o
......@@ -26,7 +26,8 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: na = 4000, nev = 1500, nblk = 16
integer, parameter :: nblk = 16
integer na, nev
!-------------------------------------------------------------------------------
! Local Variables
......@@ -43,6 +44,21 @@ program test_real2
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
!-------------------------------------------------------------------------------
! Parse command line arguments, if given
character*16 arg1
character*16 arg2
na = 12059
nev = 3401
if (iargc() == 2) then
call getarg(1, arg1)
call getarg(2, arg2)
read(arg1, *) na
read(arg2, *) nev
endif
!-------------------------------------------------------------------------------
! MPI Initialization
......@@ -50,14 +66,6 @@ program test_real2
call mpi_comm_rank(mpi_comm_world,myid,mpierr)
call mpi_comm_size(mpi_comm_world,nprocs,mpierr)
!-------------------------------------------------------------------------------
! Reading of test parameters (matrix size, number of requested eigenvalue/eigenvector
! pairs, block size) from a file 'test_parameters.in', if that file exists.
! We only read on mpi task number myid = 0 to avoid any possible confusion.
! The parameters of interest are subsequently broadcast to all other mpi tasks.
call read_test_parameters (na,nev,nblk,myid,mpi_comm_world)
!-------------------------------------------------------------------------------
! Selection of number of processor rows/columns
! We try to set up the grid square-like, i.e. start the search for possible
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment