Commit b233069a authored by Andreas Marek
Browse files

Smaller step sizes in real SSE kernels

parent 35e63340
...@@ -164,16 +164,17 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq ...@@ -164,16 +164,17 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq
for (i = 0; i < nq-10; i+=12) for (i = 0; i < nq-10; i+=12)
{ {
hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += i; worked_on += 12;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24) for (i = 0; i < nq-20; i+=24)
{ {
hh_trafo_kernel_20_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_24_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += i; worked_on += 24;
} }
#endif #endif
if (nq == i) if (nq == i)
{ {
return; return;
...@@ -262,8 +263,8 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq ...@@ -262,8 +263,8 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq
if (worked_on != nq) if (worked_on != nq)
{ {
// printf("Error in real SSE BLOCK2 kernel \n"); printf("Error in real SSE BLOCK2 kernel %d %d\n", worked_on, nq);
// abort(); abort();
} }
} }
......
...@@ -62,6 +62,7 @@ ...@@ -62,6 +62,7 @@
#include "config-f90.h" #include "config-f90.h"
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
#define offset 2 #define offset 2
#define __SSE_DATATYPE __m128d #define __SSE_DATATYPE __m128d
...@@ -82,6 +83,8 @@ ...@@ -82,6 +83,8 @@
#endif #endif
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -149,6 +152,10 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, ...@@ -149,6 +152,10 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 4 householder vectors simultaneously // 4 householder vectors simultaneously
...@@ -200,12 +207,14 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, ...@@ -200,12 +207,14 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
for (i = 0; i < nq-4; i+=6) for (i = 0; i < nq-4; i+=6)
{ {
hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 6;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-8; i+=12) for (i = 0; i < nq-8; i+=12)
{ {
hh_trafo_kernel_12_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_12_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 12;
} }
#endif #endif
...@@ -213,28 +222,44 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, ...@@ -213,28 +222,44 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
{ {
return; return;
} }
#ifdef DOUBLE_PRECISION_REAL
if (nq-i ==4)
{
hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i ==8) if (nq-i ==8)
{ {
hh_trafo_kernel_8_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_8_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 8;
} }
#endif #endif
else
{
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i > 2) if (nq-i == 2)
{ {
hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
} worked_on += 2;
else }
{
hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); if (nq-i ==4)
{
hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 4;
}
#endif #endif
if (worked_on != nq)
{
printf("Error in real SSE BLOCK4 kernel \n");
abort();
} }
} }
/** /**
......
...@@ -62,6 +62,8 @@ ...@@ -62,6 +62,8 @@
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -145,6 +147,9 @@ void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, ...@@ -145,6 +147,9 @@ void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq,
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on ;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 6 householder vectors simultaneously // 6 householder vectors simultaneously
...@@ -255,26 +260,38 @@ void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, ...@@ -255,26 +260,38 @@ void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq,
for (i = 0; i < nq-2; i+=4) for (i = 0; i < nq-2; i+=4)
{ {
hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 4;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-4; i+=8) for (i = 0; i < nq-4; i+=8)
{ {
hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 8;
} }
#endif #endif
if (nq == i) if (nq == i)
{ {
return; return;
} }
else
{
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq -i == 2)
{
hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 2;
}
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq -i == 4)
{
hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 4;
}
#endif #endif
if (worked_on != nq)
{
printf("Error in real SSE BLOCK6 kernel \n");
abort();
} }
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment