Commit c38695bd authored by Andreas Marek

Double precision real block6 kernel for Sparc64

parent 46ed16fd
...@@ -244,12 +244,12 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -244,12 +244,12 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL if WITH_REAL_SPARC64_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL #if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif #endif
#endif endif
if WITH_REAL_VSX_BLOCK4_KERNEL if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
...@@ -288,9 +288,9 @@ endif ...@@ -288,9 +288,9 @@ endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL #if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif #endif
endif endif
if WITH_REAL_VSX_BLOCK6_KERNEL if WITH_REAL_VSX_BLOCK6_KERNEL
...@@ -689,6 +689,9 @@ EXTRA_DIST = \ ...@@ -689,6 +689,9 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_2hv_template.c \ src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \ src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \ src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sparc64_2hv_template.c \
src/elpa2/kernels/real_sparc64_4hv_template.c \
src/elpa2/kernels/real_sparc64_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \ src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \ src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \ src/elpa2/kernels/real_sse_6hv_template.c \
......
...@@ -470,8 +470,6 @@ m4_define(elpa_m4_sparc64_kernels, [ ...@@ -470,8 +470,6 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block2 real_sparc64_block2
real_sparc64_block4 real_sparc64_block4
real_sparc64_block6 real_sparc64_block6
complex_sparc64_block1
complex_sparc64_block2
]) ])
m4_define(elpa_m4_vsx_kernels, [ m4_define(elpa_m4_vsx_kernels, [
......
...@@ -72,9 +72,7 @@ enum ELPA_REAL_KERNELS { ...@@ -72,9 +72,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
...@@ -628,32 +628,32 @@ ...@@ -628,32 +628,32 @@
! sparc64 block1 complex kernel ! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL !#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then ! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */ !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) !#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
ttt = mpi_wtime() ! ttt = mpi_wtime()
do j = ncols, 1, -1 ! do j = ncols, 1, -1
#ifdef WITH_OPENMP !#ifdef WITH_OPENMP
call single_hh_trafo_& ! call single_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_1hv_& ! &_sparc64_1hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width) ! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else !#else
call single_hh_trafo_& ! call single_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_1hv_& ! &_sparc64_1hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width) ! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif !#endif
enddo ! enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */ !#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
#ifndef WITH_FIXED_COMPLEX_KERNEL !#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */ !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */ #endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
...@@ -918,45 +918,45 @@ ...@@ -918,45 +918,45 @@
! implementation of sparc64 block 2 complex case ! implementation of sparc64 block 2 complex case
#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL) #if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL !#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then ! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */ !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
ttt = mpi_wtime() ! ttt = mpi_wtime()
do j = ncols, 2, -2 ! do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) ! w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) ! w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP !#ifdef WITH_OPENMP
call double_hh_trafo_& ! call double_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_2hv_& ! &_sparc64_2hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) ! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else !#else
call double_hh_trafo_& ! call double_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_2hv_& ! &_sparc64_2hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw) ! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif !#endif
enddo ! enddo
#ifdef WITH_OPENMP !#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_& ! if (j==1) call single_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_1hv_& ! &_sparc64_1hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width) ! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else !#else
if (j==1) call single_hh_trafo_& ! if (j==1) call single_hh_trafo_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_sparc64_1hv_& ! &_sparc64_1hv_&
&PRECISION& ! &PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width) ! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif !#endif
!
#ifndef WITH_FIXED_COMPLEX_KERNEL !#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */ !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */ #endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */ #endif /* COMPLEXCASE == 1 */
......
...@@ -192,6 +192,14 @@ ...@@ -192,6 +192,14 @@
write(error_unit,*) "The GENERIC kernel will be used at the moment" write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC kernel = ELPA_2STAGE_REAL_GENERIC
endif endif
! Special case: at the moment there are NO single precision kernels for SPARC64 -> fall back to the GENERIC kernel for now
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6 ) then
write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
#endif #endif
#endif #endif
......
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
// //
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke // Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
// //
#include "config-f90.h"
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h> #include <x86intrin.h>
...@@ -82,6 +83,7 @@ ...@@ -82,6 +83,7 @@
#undef __AVX__ #undef __AVX__
#endif #endif
#ifdef HAVE_SSE_INTRINSICS
//Forward declaration //Forward declaration
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_2_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
...@@ -99,14 +101,46 @@ __forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb ...@@ -99,14 +101,46 @@ __forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb
__forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); __forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); __forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif #endif
#endif
#ifdef HAVE_SPARC64_SSE
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_6_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_10_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_20_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
#endif
#ifdef HAVE_SPARC64_SSE
// Prototypes of the SPARC64 block2 entry points; exactly one of them is
// compiled, selected by the precision macro.
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_sparc64_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
// No trailing underscore: the name has to match the definition of
// double_hh_trafo_real_sparc64_2hv_single in this file, otherwise the
// prototype declares a symbol that is never defined.
void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
/* /*
!f>#ifdef HAVE_SPARC64_SSE !f>#ifdef HAVE_SPARC64_SSE
...@@ -210,14 +244,24 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -210,14 +244,24 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-10; i+=12) for (i = 0; i < nq-10; i+=12)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 12; worked_on += 12;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24) for (i = 0; i < nq-20; i+=24)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_24_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_24_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_24_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 24; worked_on += 24;
} }
#endif #endif
...@@ -230,7 +274,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -230,7 +274,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 10) if (nq-i == 10)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_10_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_10_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_10_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 10; worked_on += 10;
} }
#endif #endif
...@@ -238,7 +287,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -238,7 +287,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 20) if (nq-i == 20)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_20_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_20_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_20_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 20; worked_on += 20;
} }
#endif #endif
...@@ -246,7 +300,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -246,7 +300,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8) if (nq-i == 8)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 8; worked_on += 8;
} }
#endif #endif
...@@ -254,7 +313,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -254,7 +313,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 16) if (nq-i == 16)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_16_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_16_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_16_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 16; worked_on += 16;
} }
#endif #endif
...@@ -263,7 +327,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -263,7 +327,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 6) if (nq-i == 6)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_6_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_6_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_6_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 6; worked_on += 6;
} }
#endif #endif
...@@ -271,7 +340,13 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -271,7 +340,13 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 12) if (nq-i == 12)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 12; worked_on += 12;
} }
#endif #endif
...@@ -279,7 +354,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -279,7 +354,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4) if (nq-i == 4)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 4; worked_on += 4;
} }
#endif #endif
...@@ -287,7 +367,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -287,7 +367,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 8) if (nq-i == 8)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_8_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 8; worked_on += 8;
} }
#endif #endif
...@@ -295,7 +380,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -295,7 +380,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2) if (nq-i == 2)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 2; worked_on += 2;
} }
#endif #endif
...@@ -303,14 +393,26 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -303,14 +393,26 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 4) if (nq-i == 4)
{ {
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 4; worked_on += 4;
} }
#endif #endif
#ifdef WITH_DEBUG #ifdef WITH_DEBUG
if (worked_on != nq) if (worked_on != nq)
{ {
#ifdef HAVE_SSE_INTRINSICS
printf("Error in real SSE BLOCK2 kernel %d %d\n", worked_on, nq); printf("Error in real SSE BLOCK2 kernel %d %d\n", worked_on, nq);
#endif
#ifdef HAVE_SPARC64_SSE
printf("Error in real SPARC64 BLOCK2 kernel %d %d\n", worked_on, nq);
#endif
abort(); abort();
} }
#endif #endif
...@@ -327,12 +429,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -327,12 +429,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) __forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) __forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif #endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{ {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 x nb+1] * hh // Matrix Vector Multiplication, Q [12 x nb+1] * hh
...@@ -661,11 +773,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -661,11 +773,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL