Commit 1151f6a5 authored by Andreas Marek

Move GPU "contains" functions into a separate module

parent a0934d4e
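
The change is mechanical but worth spelling out: these helper routines previously lived in the "contains" section of trans_ev_tridi_to_band_real and reached variables such as a_dev, stripe_count and unpack_idx through host association. Once they move into the module pack_unpack_real_gpu, every such variable has to become an explicit dummy argument and every call site has to pass it. A minimal sketch of the pattern, with hypothetical names (not ELPA code):

    module helper_mod
      implicit none
    contains
      ! After the move: everything the routine used to see by host
      ! association now arrives as an explicit argument.
      subroutine scale_buffer(n, factor, x)
        integer, intent(in)    :: n
        real,    intent(in)    :: factor
        real,    intent(inout) :: x(:)
        x(1:n) = factor * x(1:n)
      end subroutine
    end module

    program caller
      use helper_mod
      implicit none
      real :: buf(4)
      buf = 1.0
      ! Before the refactor this would have been "call scale_buffer()"
      ! inside a contains block, with n, factor and buf host-associated.
      call scale_buffer(4, 2.0, buf)
      print *, buf
    end program
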
@@ -18,6 +18,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
   src/mod_cuda.F90 \
   src/interface_c_kernel.F90 \
   src/mod_pack_unpack_real.F90 \
+  src/mod_pack_unpack_real_gpu.F90 \
   src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
   src/mod_compute_hh_trafo_real.F90 \
   src/mod_compute_hh_trafo_complex.F90 \
...
@@ -2475,6 +2475,7 @@ module ELPA2_compute
       use cuda_functions
       use precision
       use pack_unpack_real
+      use pack_unpack_real_gpu
       use compute_hh_trafo_real
       implicit none
       logical, intent(in) :: useGPU
@@ -2796,7 +2797,9 @@ module ELPA2_compute
           if (useGPU) then
             ! An unpacking of the current row group may occur before queuing the next row
-            call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
+            call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, &
+                                                       stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                                       row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
             call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
           else
             call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
@@ -2832,7 +2835,9 @@ module ELPA2_compute
 #else /* WITH_OPENMP */
           if (useGPU) then
             ! An unpacking of the current row group may occur before queuing the next row
-            call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
+            call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, &
+                                                       stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                                       row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
             row_group(:, row_group_size) = q(src_offset, 1:l_nev)
           else
             call unpack_row_real_cpu(a, row, i-limits(ip), stripe_count, stripe_width, last_stripe_width)
@@ -2889,7 +2894,9 @@ module ELPA2_compute
 #else /* WITH_OPENMP */
           if (useGPU) then
             ! An unpacking of the current row group may occur before queuing the next row
-            call unpack_and_prepare_row_group_real_gpu(i - limits(my_prow), .false.)
+            call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
+                                                       last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
+                                                       unpack_idx, i - limits(my_prow), .false.)
             call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
           else
             call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
@@ -2905,7 +2912,8 @@ module ELPA2_compute
         if (useGPU) then
           ! Force an unpacking of all remaining rows that haven't been unpacked yet
-          call unpack_and_prepare_row_group_real_gpu(-1, .true.)
+          call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
+                                                     a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.)
           successCUDA = cuda_devicesynchronize()
           if (.not.(successCUDA)) then
@@ -3160,8 +3168,8 @@ module ELPA2_compute
             stop
           endif
-          call extract_hh_tau_real_gpu(nbw, current_local_n, .false.)
-          call compute_hh_dot_products_real_gpu(nbw, current_local_n)
+          call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, current_local_n, .false.)
+          call compute_hh_dot_products_real_gpu(bcast_buffer_dev, hh_dot_dev, nbw, current_local_n)
         endif
       else
@@ -3175,7 +3183,7 @@ module ELPA2_compute
             stop
           endif
-          call extract_hh_tau_real_gpu(nbw, 1, .true.)
+          call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.)
         endif
       endif
@@ -3644,7 +3652,8 @@ module ELPA2_compute
           if (dst == 0) then
             if (useGPU) then
               row_group_size = min(na - num_blk*nblk, nblk)
-              call pack_row_group_real_gpu(row_group(:, :), j * nblk + a_off, row_group_size)
+              call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                           row_group(:, :), j * nblk + a_off, row_group_size)
               do i = 1, row_group_size
                 q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
@@ -3652,13 +3661,14 @@ module ELPA2_compute
             else
               do i = 1, min(na - num_blk*nblk, nblk)
-                call pack_row_real_cpu(row, j*nblk+i+a_off)
+                call pack_row_real_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
                 q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
               enddo
             endif
           else
             if (useGPU) then
-              call pack_row_group_real_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
+              call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                           result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
             else
               do i = 1, nblk
                 call pack_row_real_cpu(a, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
@@ -3963,167 +3973,6 @@ module ELPA2_compute
       call timer%stop("trans_ev_tridi_to_band_real")
 #endif
       return
contains
! Pack a filled row group (i.e. an array of consecutive rows)
subroutine pack_row_group_real_gpu(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
real(kind=rk) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Use one kernel call to pack the entire row group
! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
call launch_my_pack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
l_nev, a_dev, row_group_dev)
! Issue one single transfer call for all rows (device to host)
! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev , row_count * l_nev * size_of_real_datatype , &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
end subroutine
! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_real_gpu(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
real(kind=rk), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Issue one single transfer call for all rows (host to device)
! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
!istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_real_datatype , &
! cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * &
size_of_real_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
! Use one kernel call to pack the entire row group
! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
call launch_my_unpack_c_kernel_real( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
end subroutine
! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group
subroutine unpack_and_prepare_row_group_real_gpu(next_unpack_idx, force)
use precision
implicit none
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_real_gpu(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
subroutine compute_hh_dot_products_real_gpu(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_real( bcast_buffer_dev, hh_dot_dev, nbw, n)
end subroutine
! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_real_gpu(nbw, n, is_zero)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_real(bcast_buffer_dev,hh_tau_dev, nbw, n, val_is_zero)
end subroutine
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
! Reset a reduction block
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Reset 2 reduction blocks without an explicit synchronization at the end
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Perform a reduction on an initialized, 128-element shared block
! Compute the dot-product between 2 consecutive HH vectors
! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
! Limitation 2: the size of the warp must be equal to 32
!
! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels
!
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
!
! This is the simplest and slowest available backtransformation kernel
!
! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
! 2 Householder reflectors per iteration
!
! ---------------------------------
! Row packing and unpacking kernels
! ---------------------------------
!
! The row group packing kernel
! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)
! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel
end subroutine trans_ev_tridi_to_band_real
@@ -5187,7 +5036,7 @@ module ELPA2_compute
       ! Q = Q - V * T**T * V**T * Q
-      if (l_rows>0) then
+      if (l_rows > 0) then
         if (useGPU) then
           call cublas_zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm_dev,max_local_rows, &
                             q_dev,ldq,CZERO,tmp_dev,n_cols)
@@ -5201,7 +5050,7 @@ module ELPA2_compute
           call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), &
                      q,ldq,CZERO,tmp1,n_cols)
         endif
-      else
+      else ! l_rows > 0
         if (useGPU) then
           if (l_cols*n_cols .gt. (max_local_cols)*(nbw)) then
             print *,"trans_ev_band_to_full_complex: tmp_dev ",l_cols*n_cols,max_local_cols
@@ -5308,7 +5157,7 @@ module ELPA2_compute
     !if (istat .ne. 0) then
     !  print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
     !endif
-    endif
+    endif ! use GPU
 #ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("trans_ev_band_to_full_complex")
 #endif
@@ -7775,7 +7624,12 @@ module ELPA2_compute
             enddo
           else
             do i = 1, min(na - num_blk*nblk, nblk)
-              call pack_row_complex_cpu(row, j*nblk+i+a_off)
+#ifdef WITH_OPENMP
+              call pack_row_complex_cpu_openmp(a, row, j*nblk+i+a_off, &
+                                               stripe_width, stripe_count, max_threads, thread_width, l_nev)
+#else
+              call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
+#endif
               q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
             enddo
           endif
@@ -7784,7 +7638,13 @@ module ELPA2_compute
             call pack_row_group_complex_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
           else
             do i = 1, nblk
-              call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
+#ifdef WITH_OPENMP
+              call pack_row_complex_cpu_openmp(a, result_buffer(:,i,nbuf), j*nblk+i+a_off, &
+                                               stripe_width, stripe_count, max_threads, thread_width, l_nev)
+#else
+              call pack_row_complex_cpu(a, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, &
+                                        last_stripe_width, stripe_count)
+#endif
             enddo
           endif
...
module pack_unpack_real_gpu
#include "config-f90.h"
implicit none
public pack_row_group_real_gpu, unpack_row_group_real_gpu, &
unpack_and_prepare_row_group_real_gpu, compute_hh_dot_products_real_gpu, &
extract_hh_tau_real_gpu
contains
! Pack a filled row group (i.e. an array of consecutive rows)
subroutine pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
rows, n_offset, row_count)
use cuda_c_kernel
use cuda_functions
use precision
use iso_c_binding
implicit none
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(in) :: n_offset, row_count
real(kind=rk) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Use one kernel call to pack the entire row group
! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
call launch_my_pack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
l_nev, a_dev, row_group_dev)
! Issue one single transfer call for all rows (device to host)
! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev , row_count * l_nev * size_of_real_datatype , &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
end subroutine
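
To make the data movement concrete: assuming the striped layout implied by max_idx = (stripe_count - 1) * stripe_width + last_stripe_width, where a(:, j, s) holds stripe s of matrix row j, the single kernel launch gathers row_count consecutive rows into the contiguous row_group buffer before the one device-to-host memcpy. A hedged CPU equivalent of the pack step (illustrative names, not the actual kernel):

    subroutine pack_row_group_cpu_sketch(a, rows, n_offset, row_count, &
                                         stripe_width, last_stripe_width, stripe_count)
      implicit none
      real,    intent(in)  :: a(:,:,:)   ! (stripe_width, a_dim2, stripe_count)
      real,    intent(out) :: rows(:,:)  ! (l_nev, row_count)
      integer, intent(in)  :: n_offset, row_count
      integer, intent(in)  :: stripe_width, last_stripe_width, stripe_count
      integer :: k, s, w, base
      do k = 1, row_count
        do s = 1, stripe_count
          ! The last stripe may be narrower than the others
          w = merge(last_stripe_width, stripe_width, s == stripe_count)
          base = (s - 1) * stripe_width
          rows(base+1 : base+w, k) = a(1:w, n_offset + k, s)
        enddo
      enddo
    end subroutine

The unpack kernel below performs the inverse scatter after a single host-to-device transfer.
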
! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, rows, n_offset, row_count)
use cuda_c_kernel
use precision
use iso_c_binding
use cuda_functions
implicit none
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(in) :: n_offset, row_count
real(kind=rk), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Issue one single transfer call for all rows (host to device)
! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
!istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_real_datatype , &
! cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * &
size_of_real_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
! Use one kernel call to pack the entire row group
! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
call launch_my_unpack_c_kernel_real( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
end subroutine
! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group
subroutine unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, next_unpack_idx, force)
use iso_c_binding
use precision
implicit none
real(kind=rk) :: row_group(:,:)
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(inout) :: row_group_size
integer(kind=ik), intent(in) :: nblk
integer(kind=ik), intent(inout) :: unpack_idx
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
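
The flush protocol is easier to see in isolation: a group is flushed when it is full (nblk rows) or when the incoming row index breaks the consecutive run, and the index bookkeeping is updated on every call. A self-contained sketch of the same decision logic with no GPU involved (hypothetical names; the force flag used for the real code's final flush is modeled by the closing print):

    program row_group_queue_sketch
      implicit none
      integer, parameter :: nblk = 4          ! queue capacity, as in the real code
      integer :: incoming(7) = (/ 10, 11, 12, 13, 14, 20, 21 /)
      integer :: group_size, last_idx, idx, i
      group_size = 0
      last_idx   = -1
      do i = 1, size(incoming)
        idx = incoming(i)
        if (group_size == 0) then
          group_size = 1                      ! nothing queued yet, just start a group
        else if (group_size == nblk .or. last_idx + 1 /= idx) then
          ! full group, or the consecutive run is broken: flush, then restart
          print *, 'flush rows', last_idx - group_size + 1, '..', last_idx
          group_size = 1
        else
          group_size = group_size + 1         ! consecutive row, keep queuing
        endif
        last_idx = idx                        ! always remember the latest index
      enddo
      print *, 'final flush rows', last_idx - group_size + 1, '..', last_idx
    end program

With the inputs above this flushes rows 10..13 (group full), then row 14 (run broken at 20), then 20..21 at the end, which is exactly the batching behaviour the GPU path relies on to keep transfers large.
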
! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
subroutine compute_hh_dot_products_real_gpu(bcast_buffer_dev, hh_dot_dev, nbw, n)
use cuda_c_kernel
use precision
use iso_c_binding
implicit none
integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_real( bcast_buffer_dev, hh_dot_dev, nbw, n)
end subroutine
! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
use cuda_c_kernel
use precision
use iso_c_binding
implicit none
integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_real(bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
end subroutine
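
Besides the logical-to-integer marshalling needed at the C kernel boundary (val_is_zero), the semantics here follow the comment block below: tau is stored as the first element of each Householder reflector, and the kernel copies it out and overwrites that slot with 1.0 (or 0.0 when is_zero is set). A hedged CPU sketch of the per-reflector operation (illustrative names, not the actual kernel):

    subroutine extract_hh_tau_cpu_sketch(hh, tau, nbw, n, is_zero)
      implicit none
      integer, intent(in)    :: nbw, n
      real,    intent(inout) :: hh(nbw, n)   ! n reflectors of length nbw
      real,    intent(out)   :: tau(n)
      logical, intent(in)    :: is_zero
      tau(1:n) = hh(1, 1:n)          ! copy tau out of the leading slot
      if (is_zero) then
        hh(1, 1:n) = 0.0             ! zero the slot for the boundary case
      else
        hh(1, 1:n) = 1.0             ! a standard HH vector starts with 1
      endif
    end subroutine
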
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
! Reset a reduction block
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Reset 2 reduction blocks without an explicit synchronization at the end
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Perform a reduction on an initialized, 128-element shared block
! Compute the dot-product between 2 consecutive HH vectors
! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
! Limitation 2: the size of the warp must be equal to 32
!
! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels
!
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
!
! This is the simplest and slowest available backtransformation kernel
!
! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
! 2 Householder reflectors per iteration
!
! ---------------------------------
! Row packing and unpacking kernels
! ---------------------------------
!
! The row group packing kernel
! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)