Commit c38695bd authored by Andreas Marek's avatar Andreas Marek
Browse files

Double precision real block6 kernel for Sparc64

parent 46ed16fd
......@@ -244,12 +244,12 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
......@@ -288,9 +288,9 @@ endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
......@@ -689,6 +689,9 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sparc64_2hv_template.c \
src/elpa2/kernels/real_sparc64_4hv_template.c \
src/elpa2/kernels/real_sparc64_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
......
......@@ -470,8 +470,6 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block2
real_sparc64_block4
real_sparc64_block6
complex_sparc64_block1
complex_sparc64_block2
])
m4_define(elpa_m4_vsx_kernels, [
......
......@@ -72,9 +72,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -628,32 +628,32 @@
! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
! ttt = mpi_wtime()
! do j = ncols, 1, -1
!#ifdef WITH_OPENMP
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
! enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
......@@ -918,45 +918,45 @@
! implementation of sparc64 block 2 complex case
#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
! ttt = mpi_wtime()
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
! enddo
!#ifdef WITH_OPENMP
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
......
......@@ -192,6 +192,14 @@
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6 ) then
write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
#endif
#endif
......
......@@ -44,6 +44,7 @@
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"
#ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h>
......@@ -82,6 +83,7 @@
#undef __AVX__
#endif
#ifdef HAVE_SSE_INTRINSICS
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
......@@ -99,14 +101,46 @@ __forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb
__forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_6_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_10_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_20_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_sparc64_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_sparc64_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
/*
!f>#ifdef HAVE_SPARC64_SSE
......@@ -210,14 +244,24 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-10; i+=12)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_24_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_24_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 24;
}
#endif
......@@ -230,7 +274,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 10)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_10_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_10_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 10;
}
#endif
......@@ -238,7 +287,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 20)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_20_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_20_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 20;
}
#endif
......@@ -246,7 +300,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 8;
}
#endif
......@@ -254,7 +313,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_16_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_16_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 16;
}
#endif
......@@ -263,7 +327,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 6)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_6_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_6_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 6;
}
#endif
......@@ -271,7 +340,13 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 12)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 12;
}
#endif
......@@ -279,7 +354,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 4;
}
#endif
......@@ -287,7 +367,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 8;
}
#endif
......@@ -295,7 +380,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 2;
}
#endif
......@@ -303,14 +393,26 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
worked_on += 4;
}
#endif
#ifdef WITH_DEBUG
if (worked_on != nq)
{
#ifdef HAVE_SSE_INTRINSICS
printf("Error in real SSE BLOCK2 kernel %d %d\n", worked_on, nq);
#endif
#ifdef HAVE_SPARC64_SSE
printf("Error in real SPARC64 BLOCK2 kernel %d %d\n", worked_on, nq);
#endif
abort();
}
#endif
......@@ -327,12 +429,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 x nb+1] * hh
......@@ -661,11 +773,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_10_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_10_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_20_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
......@@ -973,11 +1096,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
......@@ -1262,11 +1396,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
......@@ -1531,11 +1676,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
......@@ -1775,11 +1931,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SPARC64_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
#endif
{
/////////////////////////////////////////////////////
......
......@@ -759,7 +759,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
h1 = tau1;
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1)
x3 = _SSE_MUL(x3, h1);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
......@@ -944,7 +944,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
#endif
#endif
#ifdef HAVE_SPARC64_INTRINSICS
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
#endif
......@@ -1164,7 +1164,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));
q3 = _SSE_SUB(q3, _SSE_MUL(x3, h1));
#ifdef HAVE_SSE_INTRINSCS
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[ldh+nb-2]);