Commit 567fc568 authored by Andreas Marek

Correct errors in VSX block2 kernel

parent afa7e5e3
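
Besides correcting the kernel itself, the commit comments out the VSX block4 kernel sources in the Automake sources list and adds CPU-binding flags (--cpu_bind=cores --hint=nomultithread) to the srun commands emitted by what appears to be the CI generator script. In the kernel template, the fixes fall into three groups: misspelled intrinsics (vec_spalts, vex_splats, vec_splatss and a two-argument vec_splats call instead of vec_splats), missing integer-pointer casts around the vec_ld/vec_st calls, and the replacement of vec_neg by a multiplication with a splatted minus one. A minimal sketch of that negation idiom, assuming GCC on a VSX-capable POWER target (the standalone program and its values are illustrative, not taken from the kernel):

    #include <altivec.h>

    /* Negate a vector without vec_neg: splat -1.0 and multiply.
     * Mirrors the kernel's h1 = vec_mul(vec_splats(mone), tau1). */
    int main(void)
    {
      double mone = -1.0;                     /* "minus one", splatted below */
      __vector double tau1 = vec_splats(2.0); /* stand-in for a tau value */
      __vector double h1 = vec_mul(vec_splats(mone), tau1);
      return (h1[0] == -2.0 && h1[1] == -2.0) ? 0 : 1; /* GCC allows [] on vectors */
    }
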
@@ -251,12 +251,12 @@ endif
#endif
#endif
#
-if WITH_REAL_VSX_BLOCK4_KERNEL
-libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
-if WANT_SINGLE_PRECISION_REAL
-libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
-endif
-endif
+#if WITH_REAL_VSX_BLOCK4_KERNEL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
+#if WANT_SINGLE_PRECISION_REAL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
+#endif
+#endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
......
@@ -623,8 +623,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\" ")
print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
print(" - echo \"srun --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\" ")
print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE" \
print(" - echo \"srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\" ")
print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE" \
+ " /scratch/elpa/bin/configure_elpa.sh" \
+ " \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" \
+ " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
@@ -638,8 +638,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
if ( instr == "sse" or (instr == "avx" and g != "with-gpu")):
print(" - make -j 8")
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
print(" - echo \"srun --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD\" ")
print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD /scratch/elpa/bin/build_elpa.sh")
print(" - echo \"srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD\" ")
print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD /scratch/elpa/bin/build_elpa.sh")
# do the test
if ( instr == "sse" or (instr == "avx" and g != "with-gpu")):
@@ -662,8 +662,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
openmp_threads=" 1 "
for na in sorted(matrix_size.keys(),reverse=True):
cores = set_number_of_cores(MPI_TASKS, o)
print(" - echo \" srun --ntasks=1 --cpus-per-task="+str(cores)+" $SRUN_COMMANDLINE_RUN\" ")
print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task="+str(cores)+" $SRUN_COMMANDLINE_RUN \
print(" - echo \" srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task="+str(cores)+" $SRUN_COMMANDLINE_RUN\" ")
print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task="+str(cores)+" $SRUN_COMMANDLINE_RUN \
/scratch/elpa/bin/run_elpa.sh "+str(MPI_TASKS) + openmp_threads +" \" TEST_FLAGS=\\\""+ matrix_size[na] + " "+ str(nev)+" "+str(nblk)+"\\\" || { cat test-suite.log; exit 1; }\"")
if (cov == "coverage"):
......
@@ -55,7 +55,7 @@
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define __SSE_DATATYPE __vector double
-#define _SSE_LOAD vec_ld
+#define _SSE_LOAD (__vector double) vec_ld
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_NEG vec_neg
@@ -65,7 +65,7 @@
#ifdef SINGLE_PRECISION_REAL
#define __SSE_DATATYPE __vector float
-#define _SSE_LOAD vec_ld
+#define _SSE_LOAD (__vector float) vec_ld
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_NEG vec_neg
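
The redefined _SSE_LOAD above compensates for vec_ld not accepting double or float pointers directly: the call sites below cast the address to an integer pointer, and the loaded integer vector is cast back to the floating-point type (the matching _SSE_STORE calls go through a (__vector unsigned int) view). A small sketch of the round trip, under the same assumptions as the sketch above and mirroring the casts used at the call sites (vec_ld and vec_st require 16-byte alignment):

    #include <altivec.h>

    int main(void)
    {
      double buf[2] __attribute__((aligned(16))) = { 1.0, 2.0 };
      /* Load through an integer view, as _SSE_LOAD(0, (unsigned long int *) &q[...]) does. */
      __vector double v = (__vector double) vec_ld(0, (unsigned long int *) buf);
      /* Store back through an integer view, as the kernel's _SSE_STORE calls do. */
      vec_st((__vector unsigned int) v, 0, (unsigned int *) buf);
      return (buf[0] == 1.0 && buf[1] == 2.0) ? 0 : 1;
    }
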
@@ -298,6 +298,13 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
+#ifdef DOUBLE_PRECISION_REAL
+double mone = -1.0;
+#endif
+#ifdef SINGLE_PRECISION_REAL
+float mone = -1.0;
+#endif
#ifdef HAVE_SSE_INTRINSICS
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
@@ -308,12 +315,12 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-__SSE_DATATYPE x1 = _SSE_LOAD(0 ,&q[ldq]);
-__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
-__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
-__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
-__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);
-__SSE_DATATYPE x6 = _SSE_LOAD(0, &q[ldq+5*offset]);
+__SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
+__SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
+__SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
+__SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
+__SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
+__SSE_DATATYPE x6 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+5*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
@@ -334,17 +341,17 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
__SSE_DATATYPE h2;
-__SSE_DATATYPE q1 = _SSE_LOAD(0, q);
+__SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
__SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-__SSE_DATATYPE q2 = _SSE_LOAD(0, &q[offset]);
+__SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
__SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-__SSE_DATATYPE q3 = _SSE_LOAD(0, &q[2*offset]);
+__SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
__SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-__SSE_DATATYPE q4 = _SSE_LOAD(0, &q[3*offset]);
+__SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
__SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-__SSE_DATATYPE q5 = _SSE_LOAD(0, &q[4*offset]);
+__SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
-__SSE_DATATYPE q6 = _SSE_LOAD(0, &q[5*offset]);
+__SSE_DATATYPE q6 = _SSE_LOAD(0, (unsigned long int *) &q[5*offset]);
__SSE_DATATYPE y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
for(i = 2; i < nb; i++)
{
@@ -369,22 +376,22 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
-q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2));
}
@@ -401,21 +408,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = vec_splats(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
-h1 = vec_spalts(hh[nb-1]);
+h1 = vec_splats(hh[nb-1]);
#endif
#endif
-q1 = _SSE_LOAD(0,&q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
-q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [12 x nb+1]
@@ -441,7 +448,7 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = vec_splats(hh[0]);
__SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
-__SSE_DATATYPE vs = vec_splatss(s);
+__SSE_DATATYPE vs = vec_splats(s);
#endif
#endif
@@ -449,7 +456,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = _SSE_XOR(tau1, sign);
#endif
#ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau1);
+//h1 = vec_neg(tau1);
+h1 = vec_mul(vec_splats(mone), tau1);
#endif
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
@@ -461,7 +469,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = _SSE_XOR(tau2, sign);
#endif
#ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau2);
+//h1 = vec_neg(tau2);
+h1 = vec_mul(vec_splats(mone), tau2);
#endif
h2 = _SSE_MUL(h1, vs);
@@ -471,24 +480,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
y6 = _SSE_ADD(_SSE_MUL(y6,h1), _SSE_MUL(x6,h2));
-q1 = _SSE_LOAD(0, q);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
q1 = _SSE_ADD(q1, y1);
-_SSE_STORE(q1, 0, q);
-q2 = _SSE_LOAD(0,&q[offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
q2 = _SSE_ADD(q2, y2);
-_SSE_STORE(q2, 0,&q[offset]);
-q3 = _SSE_LOAD(0, &q[2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
q3 = _SSE_ADD(q3, y3);
-_SSE_STORE(q3, 0, &q[2*offset]);
-q4 = _SSE_LOAD(0, &q[3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
q4 = _SSE_ADD(q4, y4);
-_SSE_STORE(q4, 0, &q[3*offset]);
-q5 = _SSE_LOAD(0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
q5 = _SSE_ADD(q5, y5);
-_SSE_STORE(q5, 0, &q[4*offset]);
-q6 = _SSE_LOAD(0, &q[5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[5*offset]);
q6 = _SSE_ADD(q6, y6);
-_SSE_STORE(q6, 0, &q[5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[5*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
@@ -508,24 +517,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-q1 = _SSE_LOAD(0, &q[ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, 0, &q[ldq]);
-q2 = _SSE_LOAD(0, &q[ldq+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
-_SSE_STORE(q2, 0, &q[ldq+offset]);
-q3 = _SSE_LOAD(0, &q[ldq+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[ldq+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
-_SSE_STORE(q3, 0, &q[ldq+2*offset]);
-q4 = _SSE_LOAD(0, &q[ldq+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[ldq+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
-_SSE_STORE(q4, 0, &q[ldq+3*offset]);
-q5 = _SSE_LOAD(0, &q[ldq+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[ldq+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
-_SSE_STORE(q5, 0, &q[ldq+4*offset]);
-q6 = _SSE_LOAD(0, &q[ldq+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[ldq+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(x6, _SSE_MUL(y6, h2)));
-_SSE_STORE(q6, 0, &q[ldq+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[ldq+5*offset]);
for (i = 2; i < nb; i++)
{
@@ -550,24 +559,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, &q[i*ldq]);
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[i*ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
-_SSE_STORE(q2, 0, &q[(i*ldq)+offset]);
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[(i*ldq)+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
-_SSE_STORE(q3, 0, &q[(i*ldq)+2*offset]);
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[(i*ldq)+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
-_SSE_STORE(q4, 0, &q[(i*ldq)+3*offset]);
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[(i*ldq)+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
-_SSE_STORE(q5, 0, &q[(i*ldq)+4*offset]);
-q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[(i*ldq)+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(_SSE_MUL(x6,h1), _SSE_MUL(y6, h2)));
-_SSE_STORE(q6, 0, &q[(i*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[(i*ldq)+5*offset]);
}
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
@@ -587,24 +596,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
-q1 = _SSE_LOAD(0, &q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-_SSE_STORE(q1, 0, &q[nb*ldq]);
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[nb*ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-_SSE_STORE(q2, 0, &q[(nb*ldq)+offset]);
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[(nb*ldq)+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-_SSE_STORE(q3, 0, &q[(nb*ldq)+2*offset]);
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[(nb*ldq)+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-_SSE_STORE(q4, 0, &q[(nb*ldq)+3*offset]);
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[(nb*ldq)+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
-_SSE_STORE(q5, 0, &q[(nb*ldq)+4*offset]);
-q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[(nb*ldq)+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
-_SSE_STORE(q6, 0, &q[(nb*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[(nb*ldq)+5*offset]);
}
@@ -632,6 +641,13 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
+#ifdef DOUBLE_PRECISION_REAL
+double mone = -1.0;
+#endif
+#ifdef SINGLE_PRECISION_REAL
+float mone = -1.0;
+#endif
#ifdef HAVE_SSE_INTRINSICS
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
@@ -642,11 +658,11 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-__SSE_DATATYPE x1 = _SSE_LOAD(0, &q[ldq]);
-__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
-__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
-__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
-__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);
+__SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
+__SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
+__SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
+__SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
+__SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
@@ -667,15 +683,15 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
__SSE_DATATYPE h2;
-__SSE_DATATYPE q1 = _SSE_LOAD(0, q);
+__SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
__SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-__SSE_DATATYPE q2 = _SSE_LOAD(0, &q[offset]);
+__SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
__SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-__SSE_DATATYPE q3 = _SSE_LOAD(0, &q[2*offset]);
+__SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
__SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-__SSE_DATATYPE q4 = _SSE_LOAD(0, &q[3*offset]);
+__SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
__SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-__SSE_DATATYPE q5 = _SSE_LOAD(0, &q[4*offset]);
+__SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
for(i = 2; i < nb; i++)
{
@@ -701,19 +717,19 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
}
@@ -731,19 +747,19 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = vec_splats(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
-h1 = vex_splats(hh[nb-1]);
+h1 = vec_splats(hh[nb-1]);
#endif
#endif
-q1 = _SSE_LOAD(0, &q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [12 x nb+1]
@@ -768,7 +784,7 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
__SSE_DATATYPE vs = vec_splats(s);
#endif
#ifdef SINGLE_PRECISION_REAL
-__SSE_DATATYPE tau1 = vec_splats(hh[0], hh[0]);
+__SSE_DATATYPE tau1 = vec_splats(hh[0]);
__SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
__SSE_DATATYPE vs = vec_splats(s);
@@ -779,7 +795,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = _SSE_XOR(tau1, sign);
#endif
#ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau1);
+h1 = vec_mul(vec_splats(mone), tau1);
+// h1 = vec_neg(tau1);
#endif
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
@@ -790,7 +807,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
h1 = _SSE_XOR(tau2, sign);
#endif
#ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau2);
+// h1 = vec_neg(tau2);
+h1 = vec_mul(vec_splats(mone), tau2);
#endif
h2 = _SSE_MUL(h1, vs);
@@ -799,21 +817,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
-q1 = _SSE_LOAD(0, q);
+q1 = _SSE_LOAD(0, (unsigned int *) &q[0]);
q1 = _SSE_ADD(q1, y1);
-_SSE_STORE(q1, 0, q);
-q2 = _SSE_LOAD(0, &q[offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
q2 = _SSE_ADD(q2, y2);
-_SSE_STORE(q2, 0, &q[offset]);
-q3 = _SSE_LOAD(0,&q[2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
q3 = _SSE_ADD(q3, y3);
-_SSE_STORE(q3,0,&q[2*offset]);
-q4 = _SSE_LOAD(0, &q[3*offset]);
+_SSE_STORE((__vector unsigned int) q3,0, (unsigned int *) &q[2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
q4 = _SSE_ADD(q4, y4);
-_SSE_STORE(q4, 0, &q[3*offset]);
-q5 = _SSE_LOAD(0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
q5 = _SSE_ADD(q5, y5);
-_SSE_STORE(q5, 0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[4*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
@@ -832,21 +850,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
#endif
#endif
-q1 = _SSE_LOAD(0, &q[ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, 0, &q[ldq]);
-q2 = _SSE_LOAD(0, &q[ldq+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));