Commit 0a63551f authored by Andreas Marek's avatar Andreas Marek
Browse files

GPU layer for multiply_a_b

parent 7ea680e7
......@@ -111,12 +111,23 @@
success = .true.
! GPU settings
call obj%get("nvidia-gpu", gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu. Aborting..."
stop
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else
gpu = 0
endif
useGPU = (gpu == 1)
if(useGPU) then
......@@ -180,11 +191,7 @@
call obj%timer%start("check_for_gpu")
if (check_for_gpu(obj, myid, numGPU)) then
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
......@@ -194,34 +201,34 @@
! copy b to b_dev
num = ldb*ldbCols*size_of_datatype
successCUDA = cuda_malloc(b_dev,num)
successCUDA = gpu_malloc(b_dev,num)
check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(b),kind=c_intptr_t),num,&
successCUDA = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
successCUDA = cuda_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
successCUDA = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
cudaMemcpyHostToDevice)
check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successCUDA)
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_malloc_host(aux_host,num)
successCUDA = gpu_malloc_host(aux_host,num)
check_host_alloc_cuda("elpa_mult_at_b: aux_host", successCUDA)
call c_f_pointer(aux_host,aux_mat,(/l_rows,nblk_mult/))
successCUDA = cuda_malloc(aux_dev,num)
successCUDA = gpu_malloc(aux_dev,num)
check_alloc_cuda("elpa_mult_at_b: aux_dev", successCUDA)
num = nblk_mult*l_cols*size_of_datatype
successCUDA = cuda_malloc_host(tmp1_host,num)
successCUDA = gpu_malloc_host(tmp1_host,num)
check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host,tmp1,(/nblk_mult,l_cols/))
successCUDA = cuda_malloc(tmp1_dev,num)
successCUDA = gpu_malloc(tmp1_dev,num)
check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successCUDA)
else ! useGPU
allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
......@@ -348,22 +355,22 @@
if (lrs<=lre) then
if (useGPU) then
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
num, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successCUDA)
aux_off = (lrs-1)*size_of_datatype
b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, &
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, &
lre-lrs+1, ONE, aux_dev+aux_off, l_rows, b_dev+b_off, ldb, ZERO, &
tmp1_dev, nstor)
call obj%timer%stop("cublas")
num = nstor*(lce-lcs+1)*size_of_datatype
successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp1_dev, num, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp1_dev, num, gpuMemcpyDeviceToHost)
check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successCUDA)
else ! useGPU
call obj%timer%start("blas")
......@@ -406,25 +413,25 @@
enddo
if (useGPU) then
successCUDA = cuda_free(b_dev)
successCUDA = gpu_free(b_dev)
check_dealloc_cuda("elpa_multiply_a_b: b_dev", successCUDA)
successCUDA = cuda_host_unregister(int(loc(b),kind=c_intptr_t))
successCUDA = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
check_host_unregister_cuda("elpa_multiply_a_b: b", successCUDA)
nullify(aux_mat)
nullify(tmp1)
successCUDA = cuda_free_host(aux_host)
successCUDA = gpu_free_host(aux_host)
check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successCUDA)
successCUDA = cuda_free(aux_dev)
successCUDA = gpu_free(aux_dev)
check_dealloc_cuda("elpa_multiply_a_b: aux_dev", successCUDA)
successCUDA = cuda_free_host(tmp1_host)
successCUDA = gpu_free_host(tmp1_host)
check_host_dealloc_cuda("elpa_multiply_a_b: tmp1_host", successCUDA)
successCUDA = cuda_free(tmp1_dev)
successCUDA = gpu_free(tmp1_dev)
check_dealloc_cuda("elpa_multiply_a_b: tmp1_dev", successCUDA)
else ! useGPU
deallocate(aux_mat, stat=istat, errmsg=errorMessage)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment