//#define SSECODE //Explicit SSE code. Not correct yet, since the loop counter is assumed to be a multiple of 4. In any case it is not faster than the autovectorized code.
//#define SSECODE //Explicit SSE code. Not correct yet, since the loop counter is assumed to be a multiple of 4. In any case it is not faster than the autovectorized code, and it is only implemented for float, not for double.
#endif
#ifdef SSECODE
...
...
@@ -156,7 +156,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
if(myShift<32)//Warp Size is 32, threads are synched automatically
{
volatilefloat*vbuf=buf;//Mem must be volatile such that memory access is not reordered
volatilemyfloat_t*vbuf=buf;//Mem must be volatile such that memory access is not reordered