Commit 51d363ff authored by Andreas Marek's avatar Andreas Marek

REAL_SIMPLE_BLOCK4 kernel

parent 25c74724
......@@ -108,6 +108,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \
......@@ -188,6 +189,10 @@ if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_simple.F90
endif
if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......@@ -780,6 +785,7 @@ EXTRA_DIST = \
src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
......@@ -549,6 +549,7 @@ m4_pattern_forbid([elpa_m4])
dnl List of the generic kernel names, one name per line.
m4_define(elpa_m4_generic_kernels, [
real_generic
real_generic_simple
real_generic_simple_block4
complex_generic
complex_generic_simple
])
......
......@@ -46,7 +46,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
......@@ -77,6 +77,10 @@
use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
......@@ -1307,7 +1311,120 @@
#endif /* REALCASE == 1 */
#if REALCASE == 1
! sparc64 block4 real kernel
! generic simple block4 real kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL))
  ! Main sweep: walk the columns downwards from ncols in steps of four.
  ! Each pass copies four columns of bcast_buffer into w(:,1:4) and hands
  ! them to the 4hv kernel in a single call.
  do j = ncols, 4, -4
    w(:,1) = bcast_buffer(1:nbw,j+off)
    w(:,2) = bcast_buffer(1:nbw,j+off-1)
    w(:,3) = bcast_buffer(1:nbw,j+off-2)
    w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
    call quad_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_4hv_&
    &PRECISION&
    & (a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
    ! The first bound is stripe_width, as in every other call of this section
    ! (it used to be misspelled here, which is an undeclared name).
    call quad_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_4hv_&
    &PRECISION&
    & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#else
#ifdef USE_ASSUMED_SIZE
    call quad_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_4hv_&
    &PRECISION&
    & (a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
#else
    call quad_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_4hv_&
    &PRECISION&
    & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#endif
  enddo

  ! Remainder sweep: j now holds the first loop value below 4, i.e. the number
  ! of columns the block-of-four loop did not touch. Process them in pairs
  ! with the two-vector kernel.
  do jj = j, 2, -2
    w(:,1) = bcast_buffer(1:nbw,jj+off)
    w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
    call double_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_&
    &PRECISION&
    & (a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
    call double_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_&
    &PRECISION&
    & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#else
#ifdef USE_ASSUMED_SIZE
    call double_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_&
    &PRECISION&
    & (a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
    call double_hh_trafo_&
    &MATH_DATATYPE&
    &_generic_simple_&
    &PRECISION&
    & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#endif
  enddo

  ! A final value of jj equal to 1 means exactly one column is left over;
  ! it is applied on its own with the single-vector routine.
#ifdef WITH_OPENMP
  if (jj==1) call single_hh_trafo_&
  &MATH_DATATYPE&
  &_cpu_openmp_&
  &PRECISION&
  & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
  bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
  if (jj==1) call single_hh_trafo_&
  &MATH_DATATYPE&
  &_cpu_&
  &PRECISION&
  & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! sparc64 block4 real kernel
#if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
......@@ -1316,7 +1433,6 @@
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -1377,6 +1493,7 @@
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
......
This diff is collapsed.
......@@ -56,7 +56,7 @@ module elpa2_utilities
implicit none
public
integer(kind=c_int), parameter :: number_of_real_kernels = ELPA_2STAGE_NUMBER_OF_REAL_KERNELS - 6
integer(kind=c_int), parameter :: number_of_real_kernels = ELPA_2STAGE_NUMBER_OF_REAL_KERNELS - 7
integer(kind=c_int), parameter :: number_of_complex_kernels = ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS
#ifdef WITH_REAL_GENERIC_KERNEL
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment