Commit 5a35d015 authored by Andreas Marek's avatar Andreas Marek
Browse files

Remove unnecessary device synchronize in complex case

parent 1ca304fb
...@@ -477,12 +477,14 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* ...@@ -477,12 +477,14 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
#endif #endif
{ {
#if 0
cudaDeviceSynchronize(); cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to compute_ hh_ trafo c kernel: %s, %d\n",cudaGetErrorString(err), err); if(err != cudaSuccess) printf("error prior to compute_ hh_ trafo c kernel: %s, %d\n",cudaGetErrorString(err), err);
dim3 n_block,n_thread; dim3 n_block, n_thread;
n_block = dim3(nev,1,1); n_block = dim3(nev,1,1);
n_thread = dim3(nb,1,1); n_thread = dim3(nb,1,1);
#endif
switch (nb) switch (nb)
{ {
...@@ -491,63 +493,64 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* ...@@ -491,63 +493,64 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
case 128: case 128:
case 64: case 64:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<16><<<nev, nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<16><<<nev, nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
case 32: case 32:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<8><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<8><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
case 16: case 16:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<4><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<4><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
case 8: case 8:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<2><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<2><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
case 4: case 4:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<1><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<1><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
case 2: case 2:
case 1: case 1:
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_2_2_complex_double<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_double<0><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else #else
compute_hh_trafo_c_kernel_2_2_complex_single<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols); compute_hh_trafo_c_kernel_2_2_complex_single<0><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif #endif
break; break;
default: default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n"); printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
} }
#if 0
cudaDeviceSynchronize(); cudaDeviceSynchronize();
err = cudaGetLastError(); err = cudaGetLastError();
if ( err!= cudaSuccess) if ( err!= cudaSuccess)
{ {
printf("\n compute hh trafo c kernel failed %s \n",cudaGetErrorString(err) ); printf("\n compute hh trafo c kernel failed %s \n",cudaGetErrorString(err) );
} }
#endif
} }
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment