Commit fa6778b7 authored by Andreas Marek

Remove complex GPU call from OpenMP region

This closes issue #51.
parent 8638a970
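
The commit title names the general pattern involved: a (complex-case) GPU call is taken out of an OpenMP parallel region, so that device transfers are issued by a single thread outside the threaded code. As a rough illustration of that pattern only, here is a minimal, self-contained sketch in CUDA C, assuming a CUDA toolchain with OpenMP enabled; the buffer names and sizes are invented for the example and are not taken from the ELPA sources.

```c
/*
 * Minimal sketch (not ELPA code): keep CUDA calls outside the OpenMP
 * parallel region.  The single cudaMemcpy below is issued by the main
 * thread before the threads start; the parallel loop touches host
 * memory only.  Buffer names and sizes are invented for illustration.
 *
 * Build (example): nvcc -Xcompiler -fopenmp sketch.cu -o sketch
 */
#include <cuda_runtime.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int stripe_count = 4;        /* number of work stripes */
    const int stripe_width = 1024;     /* elements per stripe    */
    const size_t n     = (size_t)stripe_count * stripe_width;
    const size_t bytes = n * sizeof(double);

    double *a_host = (double *)malloc(bytes);
    double *a_dev  = NULL;
    for (size_t i = 0; i < n; i++) a_host[i] = (double)i;

    if (cudaMalloc((void **)&a_dev, bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }

    /* One host-to-device transfer, done BEFORE the OpenMP region,
       instead of a cudaMemcpy per thread inside it. */
    if (cudaMemcpy(a_dev, a_host, bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed\n");
        return 1;
    }

    /* Threaded host-side work only; no CUDA calls inside this region. */
    #pragma omp parallel for
    for (int s = 0; s < stripe_count; s++) {
        double sum = 0.0;
        for (int j = 0; j < stripe_width; j++)
            sum += a_host[(size_t)s * stripe_width + j];
        printf("thread %d: stripe %d checksum %.1f\n",
               omp_get_thread_num(), s, sum);
    }

    cudaFree(a_dev);
    free(a_host);
    return 0;
}
```

In the actual code the transfers go through ELPA's own CUDA wrappers (cuda_memcpy and friends), as visible in the diff below; the sketch only mirrors the structural point made by the commit title.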
@@ -105,12 +105,12 @@
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: q(ldq,*)
#else
MATH_DATATYPE(kind=rck) :: q(ldq,matrixCols)
#endif
MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
integer(kind=c_intptr_t) :: q_dev
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
@@ -130,86 +130,75 @@
logical :: flag
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:)
#else
MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:)
#endif
MATH_DATATYPE(kind=rck) :: a_var
type(c_ptr) :: aIntern_ptr
MATH_DATATYPE(kind=rck), allocatable :: row(:)
MATH_DATATYPE(kind=rck), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_intptr_t) :: num
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
integer(kind=ik) :: n_times
integer(kind=ik) :: top, chunk, this_chunk
MATH_DATATYPE(kind=rck), allocatable :: result_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
-#ifdef WITH_OPENMP
-! integer(kind=ik), allocatable :: mpi_statuses(:,:)
-#endif
-#ifdef WITH_OPENMP
-#ifdef WITH_MPI
-! integer(kind=ik) :: my_MPI_STATUS_(MPI_STATUS_SIZE)
-#endif
-#endif
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
logical, intent(in) :: wantDebug
logical :: success
integer(kind=ik) :: istat, print_flops
character(200) :: errorMessage
logical :: successCUDA
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call obj%timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
@@ -1434,7 +1423,7 @@
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
@@ -1485,7 +1474,7 @@
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
last_stripe_width, kernel)
@@ -1784,7 +1773,7 @@
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, &
last_stripe_width, kernel)
@@ -1862,7 +1851,7 @@
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
stripe_width*nbw * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
@@ -1983,13 +1972,13 @@
else ! (dst == 0)
if (useGPU) then
call pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
do i = 1, nblk
@@ -2338,6 +2327,7 @@
if (useGPU) then
#if COMPLEXCASE == 1
+! should this not be done always?
successCUDA = cuda_free(aIntern_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
......
@@ -63,7 +63,7 @@
real(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
integer(kind=ik) :: max_idx
logical :: successCUDA
@@ -97,7 +97,6 @@
&: error in cudaMemcpy"
stop 1
endif
-!write(*,*) cudaGetErrorString(istat)
end subroutine
@@ -133,8 +132,6 @@
! Issue one single transfer call for all rows (host to device)
! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
-!istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_double_real_datatype , &
-! cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * &
size_of_&
@@ -150,7 +147,6 @@
&: error in cudaMemcpy"
stop 1
endif
-!write(*,*) cudaGetErrorString(istat)
! Use one kernel call to pack the entire row group
! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
@@ -178,10 +174,10 @@
use precision
implicit none
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
......