Commit ab06e091 authored by Andreas Marek
Browse files

Real block2 double-precision kernel for K-Computer

parent f91ee283
...@@ -202,6 +202,13 @@ endif ...@@ -202,6 +202,13 @@ endif
endif endif
endif endif
# SPARC64 real block2 (2hv) kernel sources. The single-precision source is
# added only when WANT_SINGLE_PRECISION_REAL is also set.
if WITH_REAL_SPARC64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_2hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -231,6 +238,13 @@ endif ...@@ -231,6 +238,13 @@ endif
endif endif
# SPARC64 real block4 (4hv) kernel sources. The single-precision source is
# added only when WANT_SINGLE_PRECISION_REAL is also set.
# NOTE(review): the commit title only mentions the real block2 double-precision
# kernel - confirm that the files listed here exist in the tree.
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK4_KERNEL if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -259,6 +273,12 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -259,6 +273,12 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
# SPARC64 real block6 (6hv) kernel sources. The single-precision source is
# added only when WANT_SINGLE_PRECISION_REAL is also set.
# NOTE(review): the commit title only mentions the real block2 double-precision
# kernel - confirm that the files listed here exist in the tree.
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK6_KERNEL if WITH_REAL_SSE_BLOCK6_KERNEL
...@@ -290,6 +310,13 @@ endif ...@@ -290,6 +310,13 @@ endif
endif endif
# SPARC64 complex block1 (1hv) kernel sources. The single-precision source is
# added only when WANT_SINGLE_PRECISION_COMPLEX is also set.
# NOTE(review): the commit title only mentions the real block2 double-precision
# kernel - confirm that the files listed here exist in the tree.
if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX if WANT_SINGLE_PRECISION_COMPLEX
...@@ -319,6 +346,13 @@ if WANT_SINGLE_PRECISION_COMPLEX ...@@ -319,6 +346,13 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif endif
endif endif
# SPARC64 complex block2 (2hv) kernel sources. The single-precision source is
# added only when WANT_SINGLE_PRECISION_COMPLEX is also set.
# NOTE(review): the commit title only mentions the real block2 double-precision
# kernel - confirm that the files listed here exist in the tree.
if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX if WANT_SINGLE_PRECISION_COMPLEX
......
...@@ -466,6 +466,14 @@ m4_define(elpa_m4_sse_kernels, [ ...@@ -466,6 +466,14 @@ m4_define(elpa_m4_sse_kernels, [
complex_sse_block2 complex_sse_block2
]) ])
dnl Whitespace-separated list of the kernels that belong to the sparc64 type:
dnl three real kernels - block2 block4 block6 - and two complex kernels - block1 block2.
m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block2
real_sparc64_block4
real_sparc64_block6
complex_sparc64_block1
complex_sparc64_block2
])
m4_define(elpa_m4_avx_kernels, [ m4_define(elpa_m4_avx_kernels, [
real_avx_block2 real_avx_block2
real_avx_block4 real_avx_block4
...@@ -505,7 +513,7 @@ m4_define(elpa_m4_gpu_kernels, [ ...@@ -505,7 +513,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu complex_gpu
]) ])
m4_define(elpa_m4_kernel_types, [generic sse sse_assembly avx avx2 avx512 bgp bgq gpu]) m4_define(elpa_m4_kernel_types, [generic sparc64 sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels, m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type], m4_foreach_w([elpa_m4_type],
...@@ -538,6 +546,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [ ...@@ -538,6 +546,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable]) ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([sse],[enable]) ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable]) ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable]) ELPA_SELECT_KERNELS([avx],[enable])
...@@ -552,7 +561,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -552,7 +561,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
]) ])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option" echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi fi
...@@ -612,7 +621,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[ ...@@ -612,7 +621,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
]) ])
fi fi
]) ])
m4_foreach_w([elpa_m4_arch],[sse avx avx2 avx512],[ m4_foreach_w([elpa_m4_arch],[sparc64 sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1]) ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...@@ -646,7 +655,7 @@ dnl choosing a default kernel ...@@ -646,7 +655,7 @@ dnl choosing a default kernel
m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel], m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_generic_kernels, elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[ [
if test -z "$default_[]elpa_m4_kind[]_kernel"; then if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...@@ -664,7 +673,33 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -664,7 +673,33 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kind)[_DEFAULT]) AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kind)[_DEFAULT])
]) ])
dnl #include <fjmfunc.h>
dnl #include <emmintrin.h>
dnl int main(int argc, char **argv) {
dnl __m128d q;
dnl __m128d h1 = _fjsp_neg_v2r8(q);
dnl return 0;
dnl }
AC_LANG_PUSH([C]) AC_LANG_PUSH([C])
dnl Compile check for the sparc64 kernels. It runs only when need_sparc64 is yes,
dnl aborts configure if the test program does not compile, and otherwise
dnl defines HAVE_SPARC64_SSE.
dnl NOTE(review): the test program below includes x86intrin.h and uses the x86
dnl intrinsic _mm_loaddup_pd, while the commented-out program before this check
dnl uses fjmfunc.h and _fjsp_neg_v2r8 - confirm which test is intended for the
dnl SPARC64 target.
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv) {
double* q;
__m128d h1 = _mm_loaddup_pd(q);
return 0;
}
])],
[can_compile_sparc64=yes],
[can_compile_sparc64=no]
)
AC_MSG_RESULT([${can_compile_sparc64}])
if test x"$can_compile_sparc64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-sparc64, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi
if test x"${need_sse}" = x"yes"; then if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C) AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
......
...@@ -40,7 +40,10 @@ enum ELPA_SOLVERS { ...@@ -40,7 +40,10 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
...@@ -313,21 +313,22 @@ ...@@ -313,21 +313,22 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
if (wantDebug) then if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: GPU") call obj%timer%stop("compute_hh_trafo: GPU")
endif endif
else ! not CUDA kernel else ! not CUDA kernel
if (wantDebug) then if (wantDebug) then
call obj%timer%start("compute_hh_trafo: CPU") call obj%timer%start("compute_hh_trafo: CPU")
endif endif
#if REALCASE == 1 #if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. & if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. & kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
...@@ -622,6 +623,41 @@ ...@@ -622,6 +623,41 @@
! no sse block1 real kernel ! no sse block1 real kernel
#endif #endif
#if COMPLEXCASE == 1
! sparc64 block1 complex kernel
! Calls the sparc64 1hv kernel once per column, walking j from ncols down to 1.
! Without a fixed kernel the call is selected at run time through "kernel".
! With a fixed kernel the loop is compiled only if the sparc64 block2 complex
! kernel is not built.
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
! time stamp taken before the loop
! NOTE(review): no use of ttt is visible in this block - confirm it is read later
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
! OpenMP build: the array a has an additional thread index
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! sse block1 complex kernel ! sse block1 complex kernel
...@@ -733,8 +769,43 @@ ...@@ -733,8 +769,43 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
! implementation of sparc64 block 2 real case
! Processes the columns in pairs, from ncols downwards: two Householder vectors
! are copied from bcast_buffer into w and handed to the sparc64 2hv kernel.
! Without a fixed kernel the call is selected at run time through "kernel".
! With a fixed kernel the loop is compiled only if neither the sparc64 block6
! nor the sparc64 block4 real kernel is built.
! NOTE(review): no handling of a leftover single column (odd ncols) is visible
! in this block - confirm that this is intended.
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
! OpenMP build: the array a has an additional thread index
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of sse block 2 real case
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
...@@ -768,6 +839,52 @@ ...@@ -768,6 +839,52 @@
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if COMPLEXCASE == 1
! implementation of sparc64 block 2 complex case
! Processes the columns in pairs with the sparc64 2hv kernel and, if one column
! is left over, applies it with the sparc64 1hv kernel.
#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
! time stamp taken before the loop
! NOTE(review): no use of ttt is visible in this block - confirm it is read later
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
! OpenMP build: the array a has an additional thread index
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! after the pair loop j is 1 when ncols is odd: apply the remaining first column
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! implementation of sse block 2 complex case ! implementation of sse block 2 complex case
...@@ -1065,6 +1182,77 @@ ...@@ -1065,6 +1182,77 @@
#endif /* not WITH_FIXED_REAL_KERNEL */ #endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if REALCASE == 1
! sparc64 block4 real kernel
! Processes the columns in groups of four with the sparc64 4hv kernel, then the
! remaining columns in pairs with the sparc64 2hv kernel, and finally a single
! leftover column with the generic cpu kernel.
! With a fixed kernel the body is compiled only if the sparc64 block6 real
! kernel is not built.
#if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL))
! SPARC64 KERNEL, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! continue at the column where the loop over groups of four stopped
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! jj is 1 when a single column is left over; it is applied with the generic
! cpu kernel, which takes array sections instead of a c_loc pointer
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1 #if REALCASE == 1
! sse block4 real kernel ! sse block4 real kernel
...@@ -1290,6 +1478,93 @@ ...@@ -1290,6 +1478,93 @@
!no avx512 block4 complex kernel !no avx512 block4 complex kernel
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
!sparc64 block6 real kernel
! Processes the columns in groups of six with the sparc64 6hv kernel, then the
! remaining columns in groups of four (4hv kernel) and in pairs (2hv kernel),
! and finally a single leftover column with the generic cpu kernel.
! Without a fixed kernel the call is selected at run time through "kernel".
#if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! SPARC64 KERNEL, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! continue at the column where the loop over groups of six stopped
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
! the routine name must match the non-OpenMP branch (single underscore
! between sparc64 and 4hv)
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
! continue at the column where the loop over groups of four stopped
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! jjj is 1 when a single column is left over; it is applied with the generic
! cpu kernel, which takes array sections instead of a c_loc pointer
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1 #if REALCASE == 1
!sse block6 real kernel !sse block6 real kernel
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL) #if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
......
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,