Commit 4b034659 authored by Luka Stanisic's avatar Luka Stanisic

rel2: code development

parent 254d53db
......@@ -29,7 +29,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
include_directories(include)
set (BIOEM_ICC_FLAGS "-xHost -O3 -fno-alias -fno-fnalias -unroll -g0 -ipo")
set (BIOEM_ICC_FLAGS "-O3 -fno-alias -fno-fnalias -unroll -g0 -ip")
set (BIOEM_GCC_FLAGS "-O3 -march=native -fweb -mfpmath=sse -frename-registers -minline-all-stringops -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -ggdb")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
......@@ -50,11 +50,6 @@ if (NOT FFTW_FOUND)
endif()
include_directories(${FFTW_INCLUDE_DIRS})
find_package( Boost 1.43 REQUIRED COMPONENTS program_options )
include_directories( ${Boost_INCLUDE_DIRS} )
###Find Optional Packages
###Find CUDA
......@@ -163,7 +158,6 @@ if (FFTWF_LIBRARIES)
else()
target_link_libraries(bioEM -L${FFTW_LIBDIR} -lfftw3 -lfftw3f)
endif()
target_link_libraries(bioEM ${Boost_PROGRAM_OPTIONS_LIBRARY})
if (MPI_FOUND)
target_link_libraries(bioEM ${MPI_LIBRARIES})
......@@ -172,7 +166,6 @@ endif()
###Show Status
message(STATUS "Build Status")
message(STATUS "FFTW library: ${FFTW_LIBDIR}")
message(STATUS "Boost directory: ${Boost_LIBRARY_DIRS}")
message(STATUS "FFTW includedir: ${FFTW_INCLUDEDIR}")
message(STATUS "CUDA libraries: ${CUDA_CUDA_LIBRARY}")
message(STATUS "CUDART libraries: ${CUDA_LIBRARIES}")
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2017 Pilar Cossio, Markus Rampp, Luka Stanisic and Gerhard
Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#include "autotuner.h"
void Autotuner::Reset()
......@@ -16,19 +28,22 @@ void Autotuner::Reset()
fb = 0.;
fx = 0.;
if (algo == 3) workload = 50;
if (algo == 3)
workload = 50;
}
bool Autotuner::Needed(int iteration)
{
if (stopTuning) return false;
if (stopTuning)
return false;
switch (algo)
{
case 1:
case 3:
return iteration % (stable + 1) == stable;
case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
case 2:
return (iteration == (int) stable / 2) || (iteration == stable);
default: /* Should never happen */;
}
return false;
......@@ -46,10 +61,12 @@ bool Autotuner::Finished()
}
break;
case 2:
if (best_workload != 0) return stopTuning = true;
if (best_workload != 0)
return stopTuning = true;
break;
case 3:
if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
if ((c - b == limit) && (b - a == limit))
return stopTuning = true;
break;
default: /* Should never happen */;
}
......@@ -60,9 +77,15 @@ void Autotuner::Tune(double compTime)
{
switch (algo)
{
case 1: AlgoSimple(compTime); break;
case 2: AlgoRatio(compTime); break;
case 3: AlgoBisection(compTime); break;
case 1:
AlgoSimple(compTime);
break;
case 2:
AlgoRatio(compTime);
break;
case 3:
AlgoBisection(compTime);
break;
default: /* Should never happen */;
}
}
......@@ -121,6 +144,6 @@ void Autotuner::AlgoBisection(double compTime)
c = x;
}
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
x = (c - b > b - a) ? (int) (b + (c - b) / 2) : (int) (a + (b - a + 1) / 2);
workload = x;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer.
Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
Germany.
Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3.
......@@ -13,491 +14,185 @@
#ifndef BIOEM_ALGORITHM_H
#define BIOEM_ALGORITHM_H
//#include <boost/iterator/iterator_concepts.hpp>
#ifndef BIOEM_GPUCODE
//#define SSECODE //Explicit SSE code, not correct yet since loop counter is assumed multiple of 4, anyway not faster than autovectorized code, only implemented for float, not for double.
#endif
#ifdef SSECODE
#include <emmintrin.h>
#include <smmintrin.h>
#endif
template <int GPUAlgo>
__device__ static inline void update_prob(const myfloat_t logpro, const int iRefMap, const int iOrient, const int iConv, const int cent_x, const int cent_y, bioem_Probability& pProb, bool doAngle, myfloat_t* buf3 = NULL, int* bufint = NULL)
{
// *********** Not using FFT ALGORITHM ******************
//*********** Routine to perform the numerical BioEM intergal ***********
// ******* Summing total Probabilities *************
bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
// ******* Need a constant because of numerical divergence*****
if(pProbMap.Constoadd < logpro)
{
pProbMap.Total = pProbMap.Total * exp(-logpro + pProbMap.Constoadd);
pProbMap.Constoadd = logpro;
// ********** Getting parameters that maximize the probability ***********
if (GPUAlgo == 2)
{
bufint[0] = 1;
buf3[1] = logpro;
}
else
{
pProbMap.max.max_prob_cent_x = - cent_x;
pProbMap.max.max_prob_cent_y = - cent_y;
}
pProbMap.max.max_prob_orient = iOrient;
pProbMap.max.max_prob_conv = iConv;
// pProbMap.max.max_prob_norm = - ( -sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value ) / ( sumC * sumC - sumsquareC * param.Ntotpi);
// pProbMap.max.max_prob_mu = - ( -sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap] ) / ( sumC * sumC - sumsquareC * param.Ntotpi);
}
if (GPUAlgo != 2) pProbMap.Total += exp(logpro - pProbMap.Constoadd);
if (doAngle)
{
bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
//Summing probabilities for each orientation
if(pProbAngle.ConstAngle < logpro)
{
pProbAngle.forAngles = pProbAngle.forAngles * exp(-logpro + pProbAngle.ConstAngle);
pProbAngle.ConstAngle = logpro;
}
if (GPUAlgo != 2) pProbAngle.forAngles += exp(logpro - pProbAngle.ConstAngle);
}
}
__device__ static inline myfloat_t calc_logpro(const bioem_param_device& param, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sum, const myfloat_t sumsquare, const myfloat_t crossproMapConv, const myfloat_t sumref, const myfloat_t sumsquareref)
__device__ static inline myprob_t
calc_logpro(const bioem_param_device &param, const myfloat_t amp,
const myfloat_t pha, const myfloat_t env, const myfloat_t sum,
const myfloat_t sumsquare, const myfloat_t crossproMapConv,
const myfloat_t sumref, const myfloat_t sumsquareref)
{
//*** MAIN ROUTINE TO CALCULATE THE LOGPRO FOR ALL KERNELS*************//
// **** calculate the log posterior of Eq. of Pmw in SI of JSB paper ***//
// Related to Reference calculated Projection
const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
const myprob_t ForLogProb = sumsquare * param.Ntotpi - sum * sum;
// Products of different cross-correlations (first element in formula)
const myfloat_t firstele = param.Ntotpi * (sumsquareref * sumsquare - crossproMapConv * crossproMapConv) +
2 * sumref * sum * crossproMapConv - sumsquareref * sum * sum - sumref * sumref * sumsquare;
const myprob_t firstele =
param.Ntotpi *
(sumsquareref * sumsquare - crossproMapConv * crossproMapConv) +
2 * sumref * sum * crossproMapConv - sumsquareref * sum * sum -
sumref * sumref * sumsquare;
/// ******* Calculating log of Prob*********
// As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
// As in fortran code:
// logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
myfloat_t logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
myprob_t logpro =
(3 - param.Ntotpi) * 0.5 * log(firstele) +
(param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
//*************Adding Gaussian Prior to envelope & Defocus parameter******************
//*************Adding Gaussian Prior to envelope & Defocus
// parameter******************
if(not param.tousepsf){
logpro = logpro - env * env / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf -
(pha - param.Priordefcent ) * (pha - param.Priordefcent ) / 2. / param.sigmaPriordefo / param.sigmaPriordefo ;
} else {
myfloat_t envF,phaF;
envF = 4.* M_PI * M_PI * env / ( env * env + pha * pha) ;
phaF = 4.* M_PI * M_PI * pha / ( env * env + pha * pha);
logpro = logpro - envF * envF / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf - (phaF - param.Priordefcent ) * (phaF - param.Priordefcent ) / 2. / param.sigmaPriordefo / param.sigmaPriordefo ;
if (not param.tousepsf)
{
logpro -= env * env / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf -
(pha - param.Priordefcent) * (pha - param.Priordefcent) / 2. /
param.sigmaPriordefo / param.sigmaPriordefo -
(amp - param.Priorampcent) * (amp - param.Priorampcent) / 2. /
param.sigmaPrioramp / param.sigmaPrioramp;
}
else
{
myprob_t envF, phaF;
envF = 4. * M_PI * M_PI * env / (env * env + pha * pha);
phaF = 4. * M_PI * M_PI * pha / (env * env + pha * pha);
logpro -= envF * envF / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf -
(phaF - param.Priordefcent) * (phaF - param.Priordefcent) / 2. /
param.sigmaPriordefo / param.sigmaPriordefo -
(amp - param.Priorampcent) * (amp - param.Priorampcent) / 2. /
param.sigmaPrioramp / param.sigmaPrioramp;
}
return(logpro);
return (logpro);
}
__device__ static inline void calProb(int iRefMap, int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, myfloat_t sumC, myfloat_t sumsquareC, myfloat_t value, int disx, int disy, bioem_Probability& pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
__device__ static inline void
calProb(int iRefMap, int iOrient, int iConv, const myfloat_t amp,
const myfloat_t pha, const myfloat_t env, const myfloat_t sumC,
const myfloat_t sumsquareC, myfloat_t value, int disx, int disy,
bioem_Probability &pProb, const bioem_param_device &param,
const bioem_RefMap &RefMap)
{
// IMPORTANT ROUTINE Summation of LogProb using FFTALGO
// ********************************************************
// *********** Calculates the BioEM probability ***********
// ********************************************************
myfloat_t logpro = calc_logpro(param, amp, pha, env, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
myfloat_t logpro =
calc_logpro(param, amp, pha, env, sumC, sumsquareC, value,
RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
//GCC is too stupid to inline properly, so the code is copied here
//update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb, param.writeAngles);
#ifdef DEBUG_PROB
printf("\t\t\tProb: iRefMap %d, iOrient %d, iConv %d, "
"disx %d, disy %d, address -, value %f, logpro %f\n",
iRefMap, iOrient, iConv, disx, disy, value, logpro);
#endif
bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
// printf("Separate PtotBef: %f Const: %f logProb %f %d %d %d \n",pProbMap.Total,pProbMap.Constoadd,logpro,iRefMap,iOrient,iConv);
if(pProbMap.Constoadd < logpro)
if (pProbMap.Constoadd < logpro)
{
pProbMap.Total = pProbMap.Total * exp(-logpro + pProbMap.Constoadd);
pProbMap.Total *= exp(-logpro + pProbMap.Constoadd);
pProbMap.Constoadd = logpro;
// ********** Getting parameters that maximize the probability ***********
pProbMap.max.max_prob_cent_x = - disx;
pProbMap.max.max_prob_cent_y = - disy;
pProbMap.max.max_prob_cent_x = -disx;
pProbMap.max.max_prob_cent_y = -disy;
pProbMap.max.max_prob_orient = iOrient;
pProbMap.max.max_prob_conv = iConv;
pProbMap.max.max_prob_norm = - ( -sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value ) / ( sumC * sumC - sumsquareC * param.Ntotpi);
pProbMap.max.max_prob_mu = - ( -sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap] ) / ( sumC * sumC - sumsquareC * param.Ntotpi);
pProbMap.max.max_prob_norm =
-(-sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value) /
(sumC * sumC - sumsquareC * param.Ntotpi);
pProbMap.max.max_prob_mu =
-(-sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap]) /
(sumC * sumC - sumsquareC * param.Ntotpi);
#ifdef DEBUG_PROB
printf("\tProbabilities change: iRefMap %d, iOrient %d, iConv %d, Total "
"%f, Const %f, bestlogpro %f, sumExp -, bestId -\n",
iRefMap, iOrient, iConv, pProbMap.Total, pProbMap.Constoadd, logpro);
printf("\tParameters: iConv %d, myX -, myY -, disx %d, disy %d, probX "
"%d, probY %d\n",
iConv, disx, disy, pProbMap.max.max_prob_cent_x,
pProbMap.max.max_prob_cent_y);
#endif
}
pProbMap.Total += exp(logpro - pProbMap.Constoadd);
#ifdef DEBUG_PROB
printf("\t\tProbabilities after Sum: iRefMap %d, iOrient %d, iConv %d, "
"Total %f, Const %f, bestlogpro %f, sumExp -, bestId -\n",
iRefMap, iOrient, iConv, pProbMap.Total, pProbMap.Constoadd, logpro);
#endif
if (param.writeAngles)
{
bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
// if(iOrient==1)printf("Separate Ptot: %f Const: %f logProb %f param: %d %d %d \n",logpro,pProbAngle.ConstAngle,pProbAngle.forAngles,disx,disx,iOrient);
bioem_Probability_angle &pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
if(pProbAngle.ConstAngle < logpro)
if (pProbAngle.ConstAngle < logpro)
{
pProbAngle.forAngles = pProbAngle.forAngles * exp(-logpro + pProbAngle.ConstAngle);
pProbAngle.forAngles *= exp(-logpro + pProbAngle.ConstAngle);
pProbAngle.ConstAngle = logpro;
}
pProbAngle.forAngles += exp(logpro - pProbAngle.ConstAngle);
// if(iOrient==5)printf("After separate Ptot: %f Const: %f logProb %f \n",logpro,pProbAngle.ConstAngle,pProbAngle.forAngles);
}
}
__device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability& pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
__device__ static inline void
doRefMapFFT(const int iRefMap, const int iOrient, const int iConv,
const myfloat_t amp, const myfloat_t pha, const myfloat_t env,
const myfloat_t sumC, const myfloat_t sumsquareC,
const myfloat_t *lCC, bioem_Probability &pProb,
const bioem_param_device &param, const bioem_RefMap &RefMap)
{
//******************** Using FFT algorithm **************************
//******************* Get cross-crollation of Ical to Iobs *******************
//*********** Routine to get the Cross-Corellation from lCC for the interested center displacement *************
for (int cent_x = 0; cent_x <= param.maxDisplaceCenter; cent_x = cent_x + param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y <= param.maxDisplaceCenter; cent_y = cent_y + param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels - param.maxDisplaceCenter; cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv,amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y - param.NumberPixels, pProb, param, RefMap);
}
}
for (int cent_x = param.NumberPixels - param.maxDisplaceCenter; cent_x < param.NumberPixels; cent_x = cent_x + param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y < param.maxDisplaceCenter; cent_y = cent_y + param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv,amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x - param.NumberPixels, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels - param.maxDisplaceCenter ; cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x - param.NumberPixels, cent_y - param.NumberPixels, pProb, param, RefMap);
}
}
//************ The following if is not in the manual***********
if (param.writeCC)
{
// If the Cross-correlation is to be written out and stored using Bayesian analysis
int cc=0;
for (int cent_x = 0; cent_x < param.NumberPixels ; cent_x = cent_x + param.CCdisplace)
{
for (int cent_y = 0; cent_y < param.NumberPixels ; cent_y = cent_y + param.CCdisplace)
{
bioem_Probability_cc& pProbCC = pProb.getProbCC(iRefMap, cc);
myfloat_t ttmp,ttmp2;
ttmp2 = (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels);
if(not param.flipped){
//Here we are inverting the sign of the cross-correlation for the images that are not flipped
ttmp=-ttmp2; }
else{ ttmp=ttmp2; }
if(!param.CCwithBayes){
// Storing Only the Maximum both for flipped and not flipped
if(pProbCC.forCC < ttmp) pProbCC.forCC = ttmp;
}else {
// Storing the Cross-correlation with Bayesian formalism
if(pProbCC.ConstCC < ttmp)
{
pProbCC.forCC = pProbCC.forCC * exp(-ttmp + pProbCC.ConstCC);
pProbCC.ConstCC = ttmp;
}
pProbCC.forCC += exp(ttmp - pProbCC.ConstCC);
}
// printf("Separate %d %d Ptot: %f Const: %f logProb %f \n",cent_x,cent_y,pProbCC.forCC,pProbCC.ConstCC,ttmp);
cc++;
}
}
}
}
template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC,
const myfloat_t sumsquareC, const myfloat_t* Mapconv, bioem_Probability& pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
{
//********************* Non FOURIER ALGORITHMS (refer to David) ***********
// ********************** Calculating BioEM Probability ********************************
// ************************* Loop of center displacement here ***************************
// Taking into account the center displacement
// Inizialzing crosscorrelations of calculated projected convolutions
#ifdef SSECODE
myfloat_t sum, sumsquare, crossproMapConv;
__m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2;
#else
myfloat_t sum = 0.0;
myfloat_t sumsquare = 0.0;
myfloat_t crossproMapConv = 0.0;
#endif
// Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map
myfloat_t logpro;
if (GPUAlgo != 2 || threadActive)
{
int iStart, jStart, iEnd, jEnd;
if (cent_x < 0)
{
iStart = -cent_x;
iEnd = param.NumberPixels;
}
else
{
iStart = 0;
iEnd = param.NumberPixels - cent_x;
}
if (cent_y < 0)
{
jStart = -cent_y;
jEnd = param.NumberPixels;
}
else
{
jStart = 0;
jEnd = param.NumberPixels - cent_y;
}
for (int i = iStart; i < iEnd; i += 1)
{
#ifdef SSECODE
const float* ptr1 = &Mapconv.points[i + cent_x][jStart + cent_y];
const float* ptr2 = RefMap.getp(iRefMap, i, jStart);
int j;
const int count = jEnd - jStart;
for (j = 0; j <= count - 4; j += 4)
{
d1 = _mm_loadu_ps(ptr1);
d2 = _mm_loadu_ps(ptr2);
sum_v = _mm_add_ps(sum_v, d1);
sumsquare_v = _mm_add_ps(sumsquare_v, _mm_mul_ps(d1, d1));
cross_v = _mm_add_ps(cross_v, _mm_mul_ps(d1, d2));
ptr1 += 4;
ptr2 += 4;
}
#else
for (int j = jStart; j < jEnd; j += 1)
{
const myfloat_t pointMap = Mapconv[(i + cent_x) * param.NumberPixels + j + cent_y];
const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
crossproMapConv += pointMap * pointRefMap;
// Crosscorrelation of calculated displaced map
sum += pointMap;
// Calculate Sum of pixels squared
sumsquare += pointMap * pointMap;
}
#endif
}
#ifdef SSECODE
sum_v = _mm_hadd_ps(sum_v, sum_v);
sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
cross_v = _mm_hadd_ps(cross_v, cross_v);
sum_v = _mm_hadd_ps(sum_v, sum_v);
sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
cross_v = _mm_hadd_ps(cross_v, cross_v);
sum = _mm_cvtss_f32(sum_v);
sumsquare = _mm_cvtss_f32(sumsquare_v);
crossproMapConv = _mm_cvtss_f32(cross_v);
#endif
// Calculating elements in BioEM Probability formula
logpro = calc_logpro(param, amp, pha, env, sum, sumsquare, crossproMapConv, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
}
else
{
logpro = 0;
}
#ifdef BIOEM_GPUCODE
if (GPUAlgo == 2)
{
extern __shared__ myfloat_t buf[];
myfloat_t* buf2 = &buf[myBlockDimX];
myfloat_t* buf3 = &buf2[myBlockDimX + 4 * myRef];
int* bufint = (int*) buf3;
buf[myThreadIdxX] = logpro;
if (myShift == 0)
{
bufint[0] = 0;
}
__syncthreads();
//*********** Routine to get the Cross-Corellation from lCC for the interested
// center displacement *************
if (nShifts2 == CUDA_MAX_SHIFT_REDUCE) // 1024
for (int cent_x = 0; cent_x <= param.maxDisplaceCenter;
cent_x = cent_x + param.GridSpaceCenter)
{
if (myShift < 512) if (buf[myThreadIdxX + 512] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 512];
__syncthreads();
}
if (nShifts2 >= 512)
{
if (myShift < 256) if (buf[myThreadIdxX + 256] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 256];
__syncthreads();
}
if (nShifts2 >= 256)
{
if (myShift < 128) if (buf[myThreadIdxX + 128] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 128];
__syncthreads();
}
if (nShifts2 >= 128)
for (int cent_y = 0; cent_y <= param.maxDisplaceCenter;
cent_y = cent_y + param.GridSpaceCenter)
{
if (myShift < 64) if (buf[myThreadIdxX + 64] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 64];
__syncthreads();
calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
(myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
(myfloat_t)(param.NumberPixels * param.NumberPixels),
cent_x, cent_y, pProb, param, RefMap);
}
if (myShift < 32) //Warp Size is 32, threads are synched automatically
{
volatile myfloat_t* vbuf = buf; //Mem must be volatile such that memory access is not reordered
if (nShifts2 >= 64 && vbuf[myThreadIdxX + 32] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 32];
if (nShifts2 >= 32 && vbuf[myThreadIdxX + 16] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 16];
if (nShifts2 >= 16 && vbuf[myThreadIdxX + 8] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 8];
if (nShifts2 >= 8 && vbuf[myThreadIdxX + 4] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 4];
if (nShifts2 >= 4 && vbuf[myThreadIdxX + 2] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 2];
if (nShifts2 >= 2 && vbuf[myThreadIdxX + 1] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 1];
if (myShift == 0 && iRefMap < RefMap.ntotRefMap)
for (int cent_y = param.NumberPixels - param.maxDisplaceCenter;
cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
{
const myfloat_t logpro_max = vbuf[myThreadIdxX];
update_prob<GPUAlgo>(logpro_max, iRefMap, iOrient, iConv, -1, -1, pProb, param.writeAngles, buf3, bufint);
calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
(myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
(myfloat_t)(param.NumberPixels * param.NumberPixels),
cent_x, cent_y - param.NumberPixels, pProb, param, RefMap);
}
}
__syncthreads();
bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
if (bufint[0] == 1 && buf3[1] == logpro && iRefMap < RefMap.ntotRefMap && atomicAdd(&bufint[0], 1) == 1)
for (int cent_x = param.NumberPixels - param.maxDisplaceCenter;
cent_x < param.NumberPixels; cent_x = cent_x + param.GridSpaceCenter)
{
pProbMap.max.max_prob_cent_x = - cent_x;