Commit 5b55ad7a authored by Andreas Marek

Merge branch 'GPU_in_openmp_code_path' into 'master_pre_stage'

Gpu in openmp code path

See merge request !56
parents 4ff2f9e6 ea6daa06
@@ -57,7 +57,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_REAL
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #undef DOUBLE_PRECISION_REAL
 #include "cuUtils_template.cu"
@@ -71,7 +71,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_COMPLEX
-#if WANT_SINGLE_PRECISION_COMPLEX
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
 #undef DOUBLE_PRECISION_COMPLEX
 #include "cuUtils_template.cu"
...
@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"
@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"
...
@@ -279,8 +279,6 @@ function elpa_solve_evp_&
      ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-     ! store the number of OpenMP threads used in the calling function
-     ! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
 #endif
      call obj%timer%stop("elpa_solve_evp_&
@@ -356,6 +354,18 @@ function elpa_solve_evp_&
          success = .false.
          return
        endif
+#ifdef WITH_OPENMP_TRADITIONAL
+       ! check the number of threads that ELPA should use internally
+       ! in the GPU case at the moment only _1_ thread internally is allowed
+       call obj%get("omp_threads", nrThreads, error)
+       if (nrThreads .ne. 1) then
+         print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+         print *,"setting 1 openmp thread now"
+         call obj%set("omp_threads",1, error)
+         nrThreads=1
+         call omp_set_num_threads(nrThreads)
+       endif
+#endif
        call obj%timer%stop("check_for_gpu")
      endif
@@ -589,8 +599,6 @@ function elpa_solve_evp_&
 #endif
      ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-     ! store the number of OpenMP threads used in the calling function
-     ! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
 #endif
...
@@ -612,26 +612,28 @@ subroutine tridiag_&
             if (l_row_end < l_row_beg) cycle
 #ifdef WITH_OPENMP_TRADITIONAL
             if (mod(n_iter,n_threads) == my_thread) then
-              if (wantDebug) call obj%timer%start("blas")
-              call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
-                                  int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                  ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                  v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
-
-              if (i/=j) then
-                if (isSkewsymmetric) then
-                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                      -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
-
-                else
-                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                      ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
-                endif
-              endif
+              if (.not. useGPU) then
+                if (wantDebug) call obj%timer%start("blas")
+                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
+                                    int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                    ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                    v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
+
+                if (i/=j) then
+                  if (isSkewsymmetric) then
+                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                        -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+
+                  else
+                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                        ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                  endif
+                endif
+              endif
              if (wantDebug) call obj%timer%stop("blas")
@@ -750,11 +752,12 @@ subroutine tridiag_&
 #ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
       call obj%timer%stop("OpenMP parallel")
+      if (.not.(useGPU)) then
        do i=0,max_threads-1
          u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
          u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
        enddo
+      endif
 #endif /* WITH_OPENMP_TRADITIONAL */
      ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
...
@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
   aux2(:) = 0
 #ifdef WITH_OPENMP_TRADITIONAL
   !call omp_set_num_threads(nrThreads)
-  !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
+  !$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
+  !$omp&       comm_t, nblks_tot, aux2, aux1, nvr, nvc)
 #endif
   do n = 0, lcm_s_t-1
...
@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp&       obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
...
@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp&       obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
...
@@ -58,11 +58,10 @@ l_nev, &
                             a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
                             hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
 #ifdef WITH_OPENMP_TRADITIONAL
-                            my_thread, thread_width, &
+                            my_thread, thread_width, kernel, last_stripe_width)
 #else
-                            last_stripe_width, &
+                            last_stripe_width, kernel)
 #endif
-                            kernel)
   use precision
   use elpa_abstract_impl
@@ -141,6 +140,7 @@ kernel)
 #else /* WITH_OPENMP_TRADITIONAL */
   integer(kind=ik), intent(in)           :: l_nev, thread_width
+  integer(kind=ik), intent(in), optional :: last_stripe_width
 #if REALCASE == 1
   ! real(kind=C_DATATYPE_KIND)           :: a(stripe_width,a_dim2,stripe_count,max_threads)
   real(kind=C_DATATYPE_KIND), pointer    :: a(:,:,:,:)
@@ -221,54 +221,39 @@ kernel)
 #ifdef WITH_OPENMP_TRADITIONAL
-     if (my_thread==1) then
+     if (my_thread==1) then ! in the calling routine threads go form 1 .. max_threads
 #endif
       ttt = mpi_wtime()
 #ifdef WITH_OPENMP_TRADITIONAL
     endif
 #endif
-#ifdef WITH_OPENMP_TRADITIONAL
-#if REALCASE == 1
-     if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
-       print *,"compute_hh_trafo_&
-       &MATH_DATATYPE&
-       &_GPU OPENMP: not yet implemented"
-       stop 1
-     endif
-#endif
-#if COMPLEXCASE == 1
-     if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
-       print *,"compute_hh_trafo_&
-       &MATH_DATATYPE&
-       &_GPU OPENMP: not yet implemented"
-       stop 1
-     endif
-#endif
-#endif /* WITH_OPENMP_TRADITIONAL */
 #ifndef WITH_OPENMP_TRADITIONAL
     nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
 #else /* WITH_OPENMP_TRADITIONAL */
-    if (istripe<stripe_count) then
-      nl = stripe_width
-    else
-      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
-      nl = min(my_thread*thread_width-noff, l_nev-noff)
-      if (nl<=0) then
-        if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
-        &MATH_DATATYPE&
+    if (present(last_stripe_width)) then
+      nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
+    else
+      if (istripe<stripe_count) then
+        nl = stripe_width
+      else
+        noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
+        nl = min(my_thread*thread_width-noff, l_nev-noff)
+        if (nl<=0) then
+          if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
+          &MATH_DATATYPE&
 #ifdef WITH_OPENMP_TRADITIONAL
           &_openmp" // &
 #else
           &" // &
 #endif
           &PRECISION_SUFFIX &
           )
           return
+        endif
      endif
    endif
 #endif /* not WITH_OPENMP_TRADITIONAL */
...
@@ -629,65 +629,65 @@ max_threads)
      aux1 = 0.0_rck
 #ifdef WITH_OPENMP_TRADITIONAL
-#if 0
-      ! original complex implementation without openmp. check performance
-      nlc = 0 ! number of local columns
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
-        endif
-      enddo
-      ! Get global dot products
-#ifdef WITH_MPI
-      if (wantDebug) call obj%timer%start("mpi_communication")
-      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
-                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-      ! Transform
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
-        endif
-      enddo
-      if (wantDebug) call obj%timer%stop("mpi_communication")
-#else /* WITH_MPI */
-      ! Transform
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
-        endif
-      enddo
-#endif /* WITH_MPI */
-      !
-      ! ! Transform
-      !
-      ! nlc = 0
-      ! do j=1,lc-1
-      !   lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-      !   if (lcx>0) then
-      !     nlc = nlc+1
-      !     a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
-      !   endif
-      ! enddo
-#endif /* if 0 */
+!#if 0
+!      ! original complex implementation without openmp. check performance
+!      nlc = 0 ! number of local columns
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
+!        endif
+!      enddo
+!
+!      ! Get global dot products
+!#ifdef WITH_MPI
+!      if (wantDebug) call obj%timer%start("mpi_communication")
+!      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
+!                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+!
+!      ! Transform
+!
+!      nlc = 0
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!
+!        endif
+!      enddo
+!
+!
+!      if (wantDebug) call obj%timer%stop("mpi_communication")
+!
+!#else /* WITH_MPI */
+!
+!      ! Transform
+!
+!      nlc = 0
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+!        endif
+!      enddo
+!
+!#endif /* WITH_MPI */
+!!
+!!      ! Transform
+!!
+!!      nlc = 0
+!!      do j=1,lc-1
+!!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!!        if (lcx>0) then
+!!          nlc = nlc+1
+!!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!
+!!        endif
+!!      enddo
+!#endif /* if 0 */
 !Open up one omp region to avoid paying openmp overhead.
 !This does not help performance due to the addition of two openmp barriers around the MPI call,
@@ -909,35 +909,35 @@ max_threads)
      ! of the tiles, so we can use strips of the matrix
-#if 0
-      ! original complex implemetation check for performance
-      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
-      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
-
-      if (l_cols>0 .and. l_rows>0) then
-        do i=0,(istep*nbw-1)/tile_size
-
-          lcs = i*l_cols_tile+1
-          lce = min(l_cols,(i+1)*l_cols_tile)
-          if (lce<lcs) cycle
-
-          lre = min(l_rows,(i+1)*l_rows_tile)
-
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
-                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
-          call obj%timer%stop("blas")
-
-          if (i==0) cycle
-          lre = min(l_rows,i*l_rows_tile)
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
-                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
-          call obj%timer%stop("blas")
-        enddo
-
-      endif ! (l_cols>0 .and. l_rows>0)
-#endif /* if 0 */
+!#if 0
+!      ! original complex implemetation check for performance
+!      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
+!      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
+!
+!      if (l_cols>0 .and. l_rows>0) then
+!        do i=0,(istep*nbw-1)/tile_size
+!
+!          lcs = i*l_cols_tile+1
+!          lce = min(l_cols,(i+1)*l_cols_tile)
+!          if (lce<lcs) cycle
+!
+!          lre = min(l_rows,(i+1)*l_rows_tile)
+!
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
+!                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
+!          call obj%timer%stop("blas")
+!
+!          if (i==0) cycle
+!          lre = min(l_rows,i*l_rows_tile)
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
+!                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
+!          call obj%timer%stop("blas")
+!        enddo
+!
+!      endif ! (l_cols>0 .and. l_rows>0)
+!#endif /* if 0 */
      !Code for Algorithm 4
@@ -1396,7 +1396,11 @@ max_threads)
      ! A = A - V*U**T - U*V**T
 #ifdef WITH_OPENMP_TRADITIONAL
-      !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
+      !$omp parallel &
+      !$omp default(none) &
+      !$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
+      !$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
+      !$omp&       umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
      n_threads = omp_get_num_threads()
      if (mod(n_threads, 2) == 0) then
@@ -1424,13 +1428,30 @@ max_threads)
          myend = mystart + work_per_thread - 1
          if ( myend > lre ) myend = lre
          if ( myend-mystart+1 < 1) cycle
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
-                              int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
-                              vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
-                              umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
-                              ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
-          call obj%timer%stop("blas")
+          if (useGPU) then
+            if (n_way .gt. 1) then
+              print *,"error more than 1 openmp thread used in GPU part of elpa2_bandred"
+              print *,"this should never happen"
+              stop
+            endif
+            call obj%timer%start("cublas")
+            call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
+                                       lce-lcs+1, 2*n_cols, -ONE, &
+                                       vmr_dev, cur_l_rows, (umc_dev +(lcs-1)* &
+                                       size_of_datatype), &
+                                       cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
+                                       size_of_datatype), lda)
+            call obj%timer%stop("cublas")
+          else
+            call obj%timer%start("blas")
+            call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
+                                int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
+                                vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
+                                umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
+                                ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )