Commit c111ef44 authored by Andreas Marek's avatar Andreas Marek

Make double precision real Power8 kernels available

parent 44098022
......@@ -687,6 +687,8 @@ EXTRA_DIST = \
src/elpa2/kernels/real_avx512_4hv_template.c \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
......
......@@ -478,8 +478,6 @@ m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
real_vsx_block6
complex_vsx_block1
complex_vsx_block2
])
m4_define(elpa_m4_avx_kernels, [
......
......@@ -74,9 +74,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK1, 17, @ELPA_2STAGE_COMPLEX_VSX_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK2, 18, @ELPA_2STAGE_COMPLEX_VSX_BLOCK2_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -663,13 +663,13 @@
! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
! ttt = mpi_wtime()
! do j = ncols, 1, -1
!#ifdef WITH_OPENMP
! call single_hh_trafo_&
! &MATH_DATATYPE&
......@@ -683,12 +683,12 @@
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
! enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
......@@ -965,14 +965,14 @@
! implementation of vsx block 2 complex case
#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
! ttt = mpi_wtime()
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
! call double_hh_trafo_&
! &MATH_DATATYPE&
......@@ -986,7 +986,7 @@
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
enddo
! enddo
!#ifdef WITH_OPENMP
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
......@@ -1000,10 +1000,10 @@
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
......
......@@ -172,7 +172,7 @@
if (gpu == 1) then
if (kernel .ne. ELPA_2STAGE_REAL_GPU) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
else if (nblk .ne. 128) then
kernel = ELPA_2STAGE_REAL_GENERIC
endif
......@@ -182,6 +182,18 @@
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
endif
endif
#ifdef SINGLE_PRECISION_REAL
! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6 ) then
write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
#endif
#endif
#if COMPLEXCASE == 1
......@@ -191,7 +203,7 @@
if (gpu == 1) then
if (kernel .ne. ELPA_2STAGE_COMPLEX_GPU) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
else if (nblk .ne. 128) then
kernel = ELPA_2STAGE_COMPLEX_GENERIC
endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment