Commit aada7e38 authored by qon's avatar qon
Browse files

Add files of initial version

parents
# Build configuration for the BioEM executable (host C++ sources plus one CUDA source).
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
PROJECT( BioEM )
# Provides CUDA_ADD_EXECUTABLE and the CUDA_NVCC_FLAGS variable used below.
FIND_PACKAGE(CUDA)
# Aggressive GCC optimization flags; -march=native ties the binary to the build host, -ggdb keeps debug info.
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -fopenmp -fweb -mfpmath=sse -frename-registers -minline-all-stringops -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -ggdb" )
# NVCC flags: build only for sm_35, use fast (reduced-accuracy) math, flush denormals to zero.
SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;--use_fast_math;-ftz=true;-O4;-Xptxas -O4" )
# NOTE(review): CMake does not expand shell variables -- "$HOME" is passed literally; likely intended $ENV{HOME}.
INCLUDE_DIRECTORIES( include $HOME/usr/include )
CUDA_ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp bioem_cuda.cu )
# Language/warning flags appended after the target is declared (still apply to the whole directory).
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall -Wno-vla -Wno-unused-result -pedantic" )
TARGET_LINK_LIBRARIES( bioEM -lfftw3 -fopenmp -lboost_program_options )
\ No newline at end of file
********************************************************
BioEM: Bayesian inference of Electron Microscopy
********************************************************
Requisites:
**** FFTW libraries: http://www.fftw.org/
**** BOOST libraries: http://www.boost.org/
**** OpenMP: http://openmp.org/
Optional:
**** CMake: http://www.cmake.org/
for compilation with the CMakeLists.txt file.
**********************************************************
This diff is collapsed.
#ifndef BIOEM_ALGORITHM_H
#define BIOEM_ALGORITHM_H
// Accumulate the BioEM probability contribution of reference map iRefMap for one
// orientation (iOrient), one PSF convolution (iConv) and one center displacement
// (cent_x, cent_y) into pProb[iRefMap].
// Template parameter GPUAlgo selects the accumulation path:
//   GPUAlgo == 2: block-wide shared-memory reduction over the nShifts2 displacements
//                 of one reference map; myShift is this thread's linear shift index
//                 and myRef the index of its reference map within the thread block.
//   otherwise:    plain scalar accumulation into pProb (also used on the CPU path).
template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0)
{
/**************************************************************************************/
/********************** Calculating BioEM Probability ********************************/
/************************* Loop of center displacement here ***************************/
// Taking into account the center displacement
/*** Inizialzing crosscorrelations of calculated projected convolutions ***/
myfloat_t sum=0.0;
myfloat_t sumsquare=0.0;
myfloat_t crossproMapConv=0.0;
/****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/
// For GPUAlgo == 2 the grid is padded, so out-of-range threads skip the pixel loop
// but must still reach the __syncthreads() barriers further down.
if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap)
{
// Clip the iteration range so the displaced indices (i+cent_x, j+cent_y) stay inside the map.
int iStart, jStart, iEnd, jEnd;
if (cent_x < 0)
{
iStart = -cent_x;
iEnd = param.NumberPixels;
}
else
{
iStart = 0;
iEnd = param.NumberPixels - cent_x;
}
if (cent_y < 0)
{
jStart = -cent_y;
jEnd = param.NumberPixels;
}
else
{
jStart = 0;
jEnd = param.NumberPixels - cent_y;
}
for (int i = iStart; i < iEnd; i += 1)
{
for (int j = jStart; j < jEnd; j += 1)
{
const myfloat_t pointMap = Mapconv.points[i+cent_x][j+cent_y];
const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
crossproMapConv += pointMap * pointRefMap;
// Crosscorrelation of calculated displaced map
sum += pointMap;
// Calculate Sum of pixels squared
sumsquare += pointMap*pointMap;
}
}
}
/********** Calculating elements in BioEM Probability formula ********/
// Related to Reference calculated Projection
const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
// Products of different cross-correlations (first element in formula)
const myfloat_t firstele = param.Ntotpi * (RefMap.sumsquare_RefMap[iRefMap] * sumsquare-crossproMapConv * crossproMapConv) +
2 * RefMap.sum_RefMap[iRefMap] * sum * crossproMapConv - RefMap.sumsquare_RefMap[iRefMap] * sum * sum - RefMap.sum_RefMap[iRefMap] * RefMap.sum_RefMap[iRefMap] * sumsquare;
//******* Calculating log of Prob*********/
// As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
const myfloat_t logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
#ifdef BIOEM_GPUCODE
if (GPUAlgo == 2)
{
// Dynamic shared memory layout: buf[blockDim] (per-thread logpro, later partial sums),
// buf2[blockDim] (second accumulator), then 4 values per reference map of the block
// (buf3); bufint aliases buf3 as int for the "new maximum found" flag/counter.
extern __shared__ myfloat_t buf[];
myfloat_t* buf2 = &buf[myBlockDimX];
myfloat_t* buf3 = &buf2[myBlockDimX + 4 * myRef];
int* bufint = (int*) buf3;
buf[myThreadIdxX] = logpro;
if (myShift == 0)
{
bufint[0] = 0;
}
__syncthreads();
// Tree reduction: maximum of logpro over all shifts of this reference map.
if (nShifts2 == CUDA_MAX_SHIFT_REDUCE) // 1024
{
if (myShift < 512) if (buf[myThreadIdxX + 512] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 512];
__syncthreads();
}
if (nShifts2 >= 512)
{
if (myShift < 256) if (buf[myThreadIdxX + 256] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 256];
__syncthreads();
}
if (nShifts2 >= 256)
{
if (myShift < 128) if (buf[myThreadIdxX + 128] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 128];
__syncthreads();
}
if (nShifts2 >= 128)
{
if (myShift < 64) if (buf[myThreadIdxX + 64] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 64];
__syncthreads();
}
if (myShift < 32) //Warp Size is 32, threads are synched automatically
{
// NOTE(review): vbuf is declared float* while buf is myfloat_t* -- this assumes
// myfloat_t == float; confirm (a double myfloat_t would break this reduction).
volatile float* vbuf = buf; //Mem must be volatile such that memory access is not reordered
if (nShifts2 >= 64 && vbuf[myThreadIdxX + 32] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 32];
if (nShifts2 >= 32 && vbuf[myThreadIdxX + 16] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 16];
if (nShifts2 >= 16 && vbuf[myThreadIdxX + 8] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 8];
if (nShifts2 >= 8 && vbuf[myThreadIdxX + 4] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 4];
if (nShifts2 >= 4 && vbuf[myThreadIdxX + 2] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 2];
if (nShifts2 >= 2 && vbuf[myThreadIdxX + 1] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 1];
// Lane 0 of the shift group now holds the maximum; rescale accumulated
// probabilities when a new maximum constant is adopted (numerical stability).
if (myShift == 0 && iRefMap < RefMap.ntotRefMap)
{
const myfloat_t logpro_max = vbuf[myThreadIdxX];
if(pProb[iRefMap].Constoadd < logpro_max)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro_max + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro_max;
}
if(pProb[iRefMap].ConstAngle[iOrient] < logpro_max)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro_max + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro_max;
}
if(pProb[iRefMap].max_prob < logpro_max)
{
pProb[iRefMap].max_prob = logpro_max;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
bufint[0] = 1;
buf3[1] = logpro_max;
}
}
}
__syncthreads();
// Exactly one thread whose logpro equals the new maximum records the winning
// displacement; atomicAdd on the flag makes the writer unique if several shifts tie.
if (bufint[0] == 1 && buf3[1] == logpro && iRefMap < RefMap.ntotRefMap && atomicAdd(&bufint[0], 1) == 1)
{
pProb[iRefMap].max_prob_cent_x = cent_x;
pProb[iRefMap].max_prob_cent_y = cent_y;
}
__syncthreads();
// Second tree reduction: sum exp(logpro - constant) over all shifts, for the
// total probability (buf) and the per-orientation probability (buf2).
if (iRefMap < RefMap.ntotRefMap)
{
buf[myThreadIdxX] = exp(logpro - pProb[iRefMap].Constoadd);
buf2[myThreadIdxX] = exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
}
__syncthreads();
if (nShifts2 == CUDA_MAX_SHIFT_REDUCE) // 1024
{
if (myShift < 512)
{
buf[myThreadIdxX] += buf[myThreadIdxX + 512];
buf2[myThreadIdxX] += buf2[myThreadIdxX + 512];
}
__syncthreads();
}
if (nShifts2 >= 512)
{
if (myShift < 256)
{
buf[myThreadIdxX] += buf[myThreadIdxX + 256];
buf2[myThreadIdxX] += buf2[myThreadIdxX + 256];
}
__syncthreads();
}
if (nShifts2 >= 256)
{
if (myShift < 128)
{
buf[myThreadIdxX] += buf[myThreadIdxX + 128];
buf2[myThreadIdxX] += buf2[myThreadIdxX + 128];
}
__syncthreads();
}
if (nShifts2 >= 128)
{
if (myShift < 64)
{
buf[myThreadIdxX] += buf[myThreadIdxX + 64];
buf2[myThreadIdxX] += buf2[myThreadIdxX + 64];
}
__syncthreads();
}
if (myShift < 32) //Warp Size is 32, threads are synched automatically
{
// NOTE(review): same float*/myfloat_t aliasing assumption as in the max-reduction above.
volatile float* vbuf = buf; //Mem must be volatile such that memory access is not reordered
volatile float* vbuf2 = buf2;
if (nShifts2 >= 64)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 32];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 32];
}
if (nShifts2 >= 32)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 16];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 16];
}
if (nShifts2 >= 16)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 8];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 8];
}
if (nShifts2 >= 8)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 4];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 4];
}
if (nShifts2 >= 4)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 2];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 2];
}
if (nShifts2 >= 2)
{
vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 1];
vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 1];
}
if (myShift == 0 && iRefMap < RefMap.ntotRefMap)
{
pProb[iRefMap].Total += vbuf[myThreadIdxX];
pProb[iRefMap].forAngles[iOrient] += vbuf2[myThreadIdxX];
}
}
}
else
#endif
/***** Summing & Storing total/Orientation Probabilites for each map ************/
// Scalar path (CPU, or GPU algorithms that handle one shift per thread).
{
/******* Summing total Probabilities *************/
/******* Need a constant because of numerical divergence*****/
if(pProb[iRefMap].Constoadd < logpro)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro;
}
pProb[iRefMap].Total += exp(logpro - pProb[iRefMap].Constoadd);
//Summing probabilities for each orientation
if(pProb[iRefMap].ConstAngle[iOrient] < logpro)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro;
}
pProb[iRefMap].forAngles[iOrient] += exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
/********** Getting parameters that maximize the probability ***********/
if(pProb[iRefMap].max_prob < logpro)
{
pProb[iRefMap].max_prob = logpro;
pProb[iRefMap].max_prob_cent_x = cent_x;
pProb[iRefMap].max_prob_cent_y = cent_y;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
}
}
}
// Evaluate compareRefMap for every center displacement on the shift grid
// [-maxDisplaceCenter, +maxDisplaceCenter] with spacing GridSpaceCenter,
// accumulating all results into pProb[iRefMap].
template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap)
{
    const int maxDisp = param.maxDisplaceCenter;
    const int step = param.GridSpaceCenter;
    for (int cent_x = -maxDisp; cent_x <= maxDisp; cent_x += step)
    {
        for (int cent_y = -maxDisp; cent_y <= maxDisp; cent_y += step)
        {
            compareRefMap<GPUAlgo>(iRefMap, iOrient, iConv, Mapconv, pProb, param, RefMap, cent_x, cent_y);
        }
    }
}
#endif
#define BIOEM_GPUCODE
#if defined(_WIN32)
#include <windows.h>
#endif
#include <iostream>
using namespace std;
#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"
//__constant__ bioem_map gConvMap;
// Abort the process with a diagnostic if a CUDA runtime call reported a failure.
static inline void checkCudaErrors(cudaError_t error)
{
    if (error == cudaSuccess) return;
    printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
    exit(1);
}
// Constructor: device is not initialized yet; the GPU algorithm can be
// overridden via the GPUALGO environment variable (default 2 = loop-over-shifts).
bioem_cuda::bioem_cuda()
{
    deviceInitialized = 0;
    const char* algoEnv = getenv("GPUALGO");
    GPUAlgo = (algoEnv == NULL) ? 2 : atoi(algoEnv);
}
// Destructor: release device resources if deviceInit() was ever run.
bioem_cuda::~bioem_cuda()
{
deviceExit();
}
// Kernel for GPUAlgo == 1: one thread per reference map, one launch per
// center displacement (cent_x, cent_y).
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
// Guard: the grid is rounded up to a multiple of the block size.
if (iRefMap < RefMap->ntotRefMap)
{
compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
}
}
// Kernel for GPUAlgo == 0: one thread per reference map; each thread loops
// over all center displacements internally.
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
// Guard: the grid is rounded up to a multiple of the block size.
if (iRefMap < RefMap->ntotRefMap)
{
compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
}
}
// Kernel for GPUAlgo == 2: one thread per (reference map, shift) pair.
// nShifts is a power of two and nShiftBits = log2(nShifts), so the indices
// below are extracted with shifts and masks. blockoffset allows launching the
// total grid in chunks of at most CUDA_BLOCK_COUNT blocks.
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
{
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
// nShifts*nShifts consecutive threads share one reference map.
const int iRefMap = myid >> (nShiftBits << 1);
// Index of this thread's reference map within its block (for the per-map shared buffers).
const int myRef = myThreadIdxX >> (nShiftBits << 1);
const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
const int myShiftIdy = myid & (nShifts - 1);
// Linear shift index within the map's shift group.
const int myShift = myid & (nShifts * nShifts - 1);
const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
// No bounds guard here: compareRefMap<2> checks iRefMap < ntotRefMap itself so
// every thread participates in the block-wide __syncthreads() reductions.
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
}
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
// Floor of log2(value) for value > 0, i.e. the index of the highest set bit.
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	// BUGFIX: _BitScanReverse writes the bit index into 'index'; the previous
	// code returned 'value' itself, which broke the shift-index math for
	// GPUAlgo == 2 on Windows builds.
	return((int) index);
}
#else
// Floor of log2(value) for value > 0, i.e. the index of the highest set bit.
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif
// Compare one convolved projection (orientation iProjectionOut, convolution
// iConv) against all reference maps on the GPU, dispatching to the kernel
// selected by GPUAlgo. Blocks until all launched work has completed.
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
{
//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
// Upload the convolved map asynchronously on the same stream the kernels use.
checkCudaErrors(cudaMemcpyAsync(pConvMap_device, &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts
{
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
// The shared-memory reduction requires a power-of-two shift count that
// divides the thread count and fits the reduction depth.
if (!IsPowerOf2(nShifts))
{
cout << "Invalid number of displacements, no power of two\n";
exit(1);
}
if (CUDA_THREAD_COUNT % (nShifts * nShifts))
{
cout << "CUDA Thread count is no multiple of number of shifts\n";
exit(1);
}
if (nShifts > CUDA_MAX_SHIFT_REDUCE)
{
cout << "Too many displacements for CUDA reduction\n";
exit(1);
}
const int nShiftBits = ilog2(nShifts);
// One thread per (reference map, shift) pair, launched in chunks of at
// most CUDA_BLOCK_COUNT blocks.
size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
{
// Dynamic shared memory: two myfloat_t per thread (buf/buf2) plus four
// per reference map handled by the block (buf3 scratch).
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
}
}
else if (GPUAlgo == 1) //Split shifts in multiple kernels
{
for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
}
}
}
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod);
}
else
{
cout << "Invalid GPU Algorithm selected\n";
exit(1);
}
// Wait for the async copy and all kernel launches on this stream to finish.
checkCudaErrors(cudaStreamSynchronize(cudaStream));
return(0);
}
// Allocate all device buffers and upload the reference maps in the layout the
// selected GPU algorithm expects. Safe to call repeatedly: tears down any
// previous initialization first. Returns 0 on success (allocation/copy
// failures abort via checkCudaErrors).
int bioem_cuda::deviceInit()
{
	deviceExit();
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	checkCudaErrors(cudaMalloc(&pConvMap_device, sizeof(bioem_map)));
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		// Algorithms 0/1 use the modified reference-map layout.
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		// NOTE(review): the copy size is sizeof(bioem_RefMap) although the source
		// object is a bioem_RefMap_Mod -- confirm the two types have the same size.
		// BUGFIX: the cudaMemcpy return codes were previously ignored; check them
		// like every other CUDA call in this file.
		checkCudaErrors(cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
		delete RefMapGPU;
	}
	else
	{
		checkCudaErrors(cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
	}
	deviceInitialized = 1;
	return(0);
}
// Release all device resources allocated by deviceInit(). Best-effort teardown:
// CUDA return codes are deliberately not checked here. No-op if not initialized.
int bioem_cuda::deviceExit()
{
if (deviceInitialized == 0) return(0);
cudaStreamDestroy(cudaStream);
cudaFree(pRefMap_device);
cudaFree(pProb_device);
cudaFree(pConvMap_device);
// NOTE(review): cudaThreadExit() is deprecated in newer CUDA releases in
// favor of cudaDeviceReset() -- consider updating when the toolkit allows.
cudaThreadExit();
deviceInitialized = 0;
return(0);
}
// Upload the host-side probability accumulators to the device before a run.
// BUGFIX: the cudaMemcpy return code was previously ignored; check it like
// every other CUDA call in this file.
int bioem_cuda::deviceStartRun()
{
	checkCudaErrors(cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
	return(0);
}
// Download the accumulated probabilities back to the host after a run.
// BUGFIX: the cudaMemcpy return code was previously ignored; check it like
// every other CUDA call in this file.
int bioem_cuda::deviceFinishRun()
{
	checkCudaErrors(cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost));
	return(0);
}
// Factory returning the CUDA implementation behind the generic bioem interface;
// caller owns the returned object.
bioem* bioem_cuda_create()
{
return new bioem_cuda;
}
#ifdef _WIN32
#include "pthread_mutex_win32_wrapper.h"
#else
#include <sys/types.h>
#include <sys/syscall.h>
#include <syscall.h>
#include <dirent.h>
#include <pthread.h>
#endif
#include <vector>
#include "affinity.h"
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include "os_low_level_helper.h"
#ifndef STD_OUT
#define STD_OUT stdout
#endif
// Return the calling thread's OS-level thread id: the Win32 thread id on
// Windows, the kernel TID via the SYS_gettid syscall elsewhere (glibc exposes
// no wrapper for it).
pid_t gettid()
{
#ifdef _WIN32
return((pid_t) GetCurrentThreadId());
#else
return((pid_t) syscall(SYS_gettid));
#endif
}
#ifdef _WIN32
// POSIX-style getpid() shim for Windows, backed by GetCurrentProcessId().
pid_t getpid()
{
return((pid_t) GetCurrentProcessId());
}
#endif
// One (thread id, human-readable name) record kept in lockClass::threadNames.
struct threadNameStruct
{
pid_t thread_id;
std::string name;
};
// Holder for the global thread-name list together with the pthread mutex that
// protects it; mutex lifetime is tied to the object (init in ctor, destroy in dtor).
class lockClass
{
public:
lockClass() {pthread_mutex_init(&lock, NULL);}
~lockClass() {pthread_mutex_destroy(&lock);}
std::vector<threadNameStruct> threadNames;
pthread_mutex_t lock;
};
static lockClass lockedVector;
void setThreadName(char* name)
{
threadNameStruct tmp;
tmp.thread_id = gettid();
tmp.name = name;
pthread_mutex_lock(&lockedVector.lock);
lockedVector.threadNames.push_back(tmp);
pthread_mutex_unlock(&lockedVector.lock);
}
void setUnknownNames(char* name)
{
pid_t pid = getpid();
#ifndef _WIN32
char dirname[1024];
sprintf(dirname, "/proc/%d/task", (int) pid);
DIR* dp = opendir(dirname);
if (dp)
{
dirent* ent;
while ((ent = readdir(dp)) != NULL)
{
pid_t tid = atoi(ent->d_name);
if (tid != 0 && tid != pid)
{
int found = false;
for (size_t i = 0;i < lockedVector.threadNames.size();i++)
{
if (lockedVector.threadNames[i].thread_id == tid)
{
found = true;
}
}
if (found == false)
{
threadNameStruct tmp;
tmp.thread_id = tid;
tmp.name = name;
lockedVector.threadNames.push_back(tmp);
}
}