diff --git a/bioem.cpp b/bioem.cpp
index 086c8a895de8235408d4e2f1f45920ec9b191e18..c5688393aa40b87aaf9164e4c97f0c9f937999dc 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -81,6 +81,7 @@ bioem::bioem()
 {
 	FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
 	DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
+	nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
 }
 
 bioem::~bioem()
@@ -361,13 +362,14 @@ int bioem::run()
 
 	// ******************************** MAIN CYCLE ******************************************
 
-	mycomplex_t* proj_mapFFT;
+	mycomplex_t* proj_mapsFFT;
 	myfloat_t* conv_map = NULL;
 	mycomplex_t* conv_mapFFT;
 	myfloat_t sumCONV, sumsquareCONV;
 
 	//allocating fftw_complex vector
-	proj_mapFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
+	const int ProjMapSize = (param.FFTMapSize + 64) & ~63;	//Make sure this is properly aligned for fftw..., Actually this should be ensureb by using FFTMapSize, but it is not due to a bug in CUFFT which cannot handle padding properly
+	proj_mapsFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * ProjMapSize * nProjectionsAtOnce);
 	conv_mapFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
 	if (!FFTAlgo) conv_map = (myfloat_t*) myfftw_malloc(sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
 
@@ -379,47 +381,60 @@ int bioem::run()
 	int iOrientEnd = (int) ((long long int) (mpi_rank + 1) * param.nTotGridAngles / mpi_size);
 	if (iOrientEnd > param.nTotGridAngles) iOrientEnd = param.nTotGridAngles;
 
-	for (int iOrient = iOrientStart; iOrient < iOrientEnd; iOrient++)
+	for (int iOrientAtOnce = iOrientStart; iOrientAtOnce < iOrientEnd; iOrientAtOnce += nProjectionsAtOnce)
 	{
 		// ***************************************************************************************
 		// ***** Creating Projection for given orientation and transforming to Fourier space *****
 		if (DebugOutput >= 1) timer2.ResetStart();
 		if (DebugOutput >= 2) timer.ResetStart();
-		createProjection(iOrient, proj_mapFFT);
-		if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrient, timer.GetCurrentElapsedTime(), mpi_rank);
+		int iTmpEnd = std::min(iOrientEnd, iOrientAtOnce + nProjectionsAtOnce);
+#pragma omp parallel for
+		for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
+		{
+			createProjection(iOrient, &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]);
+		}
+		if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
 
-		// ***************************************************************************************
-		// ***** **** Internal Loop over convolutions **** *****
-		for (int iConv = 0; iConv < param.nTotCTFs; iConv++)
+		for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
 		{
-			// *** Calculating convolutions of projection map and crosscorrelations ***
+			mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
+			// ***************************************************************************************
+			// ***** **** Internal Loop over convolutions **** *****
+			for (int iConv = 0; iConv < param.nTotCTFs; iConv++)
+			{
+				// *** Calculating convolutions of projection map and crosscorrelations ***
 
-			if (DebugOutput >= 2) timer.ResetStart();
-			createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
-			if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
+				if (DebugOutput >= 2) timer.ResetStart();
+				createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
+				if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
 
-			// ***************************************************************************************
-			// *** Comparing each calculated convoluted map with all experimental maps ***
-			if (DebugOutput >= 2) timer.ResetStart();
-			compareRefMaps(iOrient, iConv, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
+				// ***************************************************************************************
+				// *** Comparing each calculated convoluted map with all experimental maps ***
+				if (DebugOutput >= 2) timer.ResetStart();
+				compareRefMaps(iOrient, iConv, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
 
-			if (DebugOutput >= 2)
+				if (DebugOutput >= 2)
+				{
+					const double compTime = timer.GetCurrentElapsedTime();
+					const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
+					const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
+										  (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
+					const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
+										(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
+					const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
+
+					printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
+				}
+			}
+			if (DebugOutput >= 1)
 			{
-				const double compTime = timer.GetCurrentElapsedTime();
-				const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
-				const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
-									  (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
-				const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
-									(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
-				const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
-
-				printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
+				printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
+				timer2.ResetStart();
 			}
 		}
-		if (DebugOutput >= 1) printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
 	}
 	//deallocating fftw_complex vector
-	myfftw_free(proj_mapFFT);
+	myfftw_free(proj_mapsFFT);
 	myfftw_free(conv_mapFFT);
 	if (!FFTAlgo) myfftw_free(conv_map);
 
@@ -626,7 +641,7 @@ int bioem::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map, myc
 	if (FFTAlgo)
 	{
 		//With FFT Algorithm
-		#pragma omp parallel for
+		#pragma omp parallel for schedule(dynamic, 1)
 		for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
 		{
 			const int num = omp_get_thread_num();
@@ -636,7 +651,7 @@ int bioem::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map, myc
 	else
 	{
 		//Without FFT Algorithm
-		#pragma omp parallel for
+		#pragma omp parallel for schedule(dynamic, 1)
 		for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
 		{
 			compareRefMapShifted < -1 > (iRefMap, iOrient, iConv, conv_map, pProb, param.param_device, RefMap);
@@ -716,52 +731,52 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
 	int i, j;
 
 	//********** Projection with radius ***************
-	if(param.doaaradius){
-
-	        int irad;
-        	myfloat_t dist, rad2;
+	if(param.doaaradius)
+	{
+		int irad;
+		myfloat_t dist, rad2;
 
-          	for(int n = 0; n < Model.nPointsModel; n++)
-                {
-                	//Getting Centers of Sphere
-               		i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
-		        j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+		for(int n = 0; n < Model.nPointsModel; n++)
+		{
+			//Getting Centers of Sphere
+			i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+			j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
 			//Getting the radius
 			irad=int( Model.points[n].radius / param.pixelSize );
-                	rad2= Model.points[n].radius * Model.points[n].radius;
+			rad2= Model.points[n].radius * Model.points[n].radius;
 
-               		//Projecting over the radius
-                	for(int ii= i - irad; ii < i + irad; ii++)
+			//Projecting over the radius
+			for(int ii= i - irad; ii < i + irad; ii++)
 			{	
 				for(int jj = j - irad; jj < j + irad; jj++)
 				{
-                       			 dist= ( (myfloat_t) (ii-i)*(ii-i)+(jj-j)*(jj-j) ) *  param.pixelSize ;
-	                       		 if( dist < rad2 )
+					dist= ( (myfloat_t) (ii-i)*(ii-i)+(jj-j)*(jj-j) ) *  param.pixelSize ;
+					if( dist < rad2 )
 					{
-					 localproj[ii * param.param_device.NumberPixels + jj] += param.pixelSize * 2 * sqrt( rad2 - dist ) * Model.points[n].density
-					 / Model.NormDen * 3 / (4 * M_PI * rad2 *  Model.points[n].radius); 	
+						localproj[ii * param.param_device.NumberPixels + jj] += param.pixelSize * 2 * sqrt( rad2 - dist ) * Model.points[n].density
+							/ Model.NormDen * 3 / (4 * M_PI * rad2 *  Model.points[n].radius); 	
 					}
-                 		}
-                	}
+				}
+			}
 		}
 	}
 	else 
 	{
-	// ************ Simple Projection over the Z axis********************
+		// ************ Simple Projection over the Z axis********************
 		for(int n = 0; n < Model.nPointsModel; n++)
 		{
- 		//Getting pixel that represents coordinates & shifting the start at to Numpix/2,Numpix/2 )
-		i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
-		j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+			//Getting pixel that represents coordinates & shifting the start at to Numpix/2,Numpix/2 )
+			i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+			j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
 
-		if (i < 0 || j < 0 || i >= param.param_device.NumberPixels || j >= param.param_device.NumberPixels)
-		{
-			if (DebugOutput >= 3) cout << "Model Point out of map: " << i << ", " << j << "\n";
-			continue;
-		}
+			if (i < 0 || j < 0 || i >= param.param_device.NumberPixels || j >= param.param_device.NumberPixels)
+			{
+				if (DebugOutput >= 3) cout << "Model Point out of map: " << i << ", " << j << "\n";
+				continue;
+			}
 
-		localproj[i * param.param_device.NumberPixels + j] += Model.points[n].density / Model.NormDen;
-         	}
+			localproj[i * param.param_device.NumberPixels + j] += Model.points[n].density / Model.NormDen;
+		}
 	}
 
 	// **** Output Just to check****
diff --git a/include/bioem.h b/include/bioem.h
index 7f673ddbb4c9e39f7164c40fba0113d90ffdc09b..4f4f12a41f3c76fafce9a534fefe5a16ec9b40a7 100644
--- a/include/bioem.h
+++ b/include/bioem.h
@@ -51,8 +51,9 @@ protected:
 	int nProjectionMaps;		//Maps in memory at a time
 	int nProjectionMapsTotal;	//Maps in total
 
-	int FFTAlgo;
-	int DebugOutput;
+	int FFTAlgo;				//Use the FFT Algorithm (Default 1)
+	int DebugOutput;			//Debug Output Level (Default 2)
+	int nProjectionsAtOnce;		//Number of projections to do at once via OpenMP (Default 1)
 };
 
 #endif
diff --git a/param.cpp b/param.cpp
index 55d9dc085ce4d87be4587a453d5fb486659f7191..079ea66dab1100bea4d3ba85a7e5c697687a6ec3 100644
--- a/param.cpp
+++ b/param.cpp
@@ -277,7 +277,6 @@ int bioem_param::readParameters(const char* fileinput)
 		exit(1);
 	}
 
-
 	param_device.NumberFFTPixels1D = param_device.NumberPixels / 2 + 1;
 	FFTMapSize = param_device.NumberPixels * param_device.NumberFFTPixels1D;