Commit b380ce92 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'ELPA2_GPU' into matrix_redistribute

parents c505b751 7097a246
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -59,7 +59,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/general/mod_elpa_skewsymmetric_blas.F90 \
src/elpa_index.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
libelpa@SUFFIX@_private_la_SOURCES += \
......@@ -123,8 +123,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES += src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
endif
if !WITH_MPI
......@@ -689,8 +688,6 @@ EXTRA_DIST = \
src/elpa1/elpa_reduce_add_vectors.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/elpa1/elpa_transpose_vectors.F90 \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \
src/elpa2/compute_hh_trafo.F90 \
src/elpa2/elpa2_bandred_template.F90 \
src/elpa2/elpa2_compute_complex_template.F90 \
......
......@@ -777,9 +777,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
if (MasterOnly):
print(" only:")
print(" - /.*master.*/")
#if (MasterOnly):
# print(" only:")
# print(" - /.*master.*/")
if (instr == "power8"):
print(" allow_failure: true")
print(" tags:")
......
......@@ -137,11 +137,24 @@ if test x"$c11_standard" = x"no"; then
AX_CHECK_COMPILE_FLAG([-std=c11], [
c11_standard=yes
], [
echo "C compiler cannot compile -std=c11 code"
echo "testing -c11.."
])
if test x"$c11_standard" = x"yes"; then
CFLAGS+=" -std=c11"
fi
fi
if test x"$c11_standard" = x"no"; then
AX_CHECK_COMPILE_FLAG([-c11], [
c11_standard=yes
], [
echo "C compiler cannot compile -c11 code"
echo "C compiler cannot compile C11 code"
exit -1
])
if test x"$c11_standard" = x"yes"; then
CFLAGS+=" -std=c11"
CFLAGS+=" -c11"
fi
fi
......
This diff is collapsed.
......@@ -71,7 +71,7 @@
#ifdef WITH_GPU_VERSION
extern "C" {
int cublasCreateFromC(intptr_t *cublas_handle) {
// printf("in c: %p\n", *cublas_handle);
*cublas_handle = (intptr_t) malloc(sizeof(cublasHandle_t));
......@@ -83,8 +83,8 @@ extern "C" {
}
else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
errormessage("Error in cublasCreate: %s\n", "the CUDA Runtime initialization failed");
return 0;
}
return 0;
}
else if (status == CUBLAS_STATUS_ALLOC_FAILED) {
errormessage("Error in cublasCreate: %s\n", "the resources could not be allocated");
return 0;
......@@ -92,7 +92,7 @@ extern "C" {
else{
errormessage("Error in cublasCreate: %s\n", "unknown error");
return 0;
}
}
}
int cublasDestroyFromC(intptr_t *cublas_handle) {
......@@ -104,28 +104,14 @@ extern "C" {
}
else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
errormessage("Error in cublasDestroy: %s\n", "the library has not been initialized");
return 0;
}
return 0;
}
else{
errormessage("Error in cublasCreate: %s\n", "unknown error");
return 0;
}
}
int cudaThreadSynchronizeFromC() {
// cudaThreadSynchronize is deprecated
// cudaDeviceSynchronize should replace it
// it is currently not used in ELPA anyways
//cudaError_t cuerr = cudaThreadSynchronize();
cudaError_t cuerr = cudaDeviceSynchronize();
if (cuerr != cudaSuccess) {
errormessage("Error in cudaDeviceSynchronize: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaSetDeviceFromC(int n) {
cudaError_t cuerr = cudaSetDevice(n);
......@@ -182,6 +168,31 @@ extern "C" {
return 1;
}
int cudaMallocHostFromC(intptr_t *a, size_t width_height) {
cudaError_t cuerr = cudaMallocHost((void **) a, width_height);
#ifdef DEBUG_CUDA
printf("MallocHost pointer address: %p \n", *a);
#endif
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMallocHost: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaFreeHostFromC(intptr_t *a) {
#ifdef DEBUG_CUDA
printf("FreeHost pointer address: %p \n", a);
#endif
cudaError_t cuerr = cudaFreeHost(a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaFreeHost: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemsetFromC(intptr_t *a, int value, size_t count) {
cudaError_t cuerr = cudaMemset( a, value, count);
......@@ -211,6 +222,27 @@ extern "C" {
}
return 1;
}
int cudaHostRegisterFromC(intptr_t *a, int value, int flag) {
cudaError_t cuerr = cudaHostRegister( a, value, flag);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaHostRegister: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaHostUnregisterFromC(intptr_t *a) {
cudaError_t cuerr = cudaHostUnregister( a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaHostUnregister: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyDeviceToDeviceFromC(void) {
int val = cudaMemcpyDeviceToDevice;
return val;
......@@ -223,6 +255,10 @@ extern "C" {
int val = cudaMemcpyDeviceToHost;
return val;
}
int cudaHostRegisterDefaultFromC(void) {
int val = cudaHostRegisterDefault;
return val;
}
int cudaHostRegisterPortableFromC(void) {
int val = cudaHostRegisterPortable;
return val;
......@@ -231,7 +267,7 @@ extern "C" {
int val = cudaHostRegisterMapped;
return val;
}
cublasOperation_t operation_new_api(char trans) {
if (trans == 'N' || trans == 'n') {
return CUBLAS_OP_N;
......@@ -263,11 +299,11 @@ extern "C" {
return CUBLAS_FILL_MODE_LOWER;
}
}
cublasSideMode_t side_mode_new_api(char side) {
if (side == 'L' || side == 'l') {
return CUBLAS_SIDE_LEFT;
}
}
else if (side == 'R' || side == 'r') {
return CUBLAS_SIDE_RIGHT;
}
......@@ -277,7 +313,7 @@ extern "C" {
return CUBLAS_SIDE_LEFT;
}
}
cublasDiagType_t diag_type_new_api(char diag) {
if (diag == 'N' || diag == 'n') {
return CUBLAS_DIAG_NON_UNIT;
......@@ -291,62 +327,62 @@ extern "C" {
return CUBLAS_DIAG_NON_UNIT;
}
}
void cublasDgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy) {
cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
double beta, double *y, int incy) {
cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
}
void cublasSgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy) {
cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
float beta, float *y, int incy) {
cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
}
void cublasZgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double _Complex alpha,
const double _Complex *A, int lda, const double _Complex *x, int incx,
double _Complex beta, double _Complex *y, int incy) {
double _Complex beta, double _Complex *y, int incy) {
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
const cuDoubleComplex* x_casted = (const cuDoubleComplex*) x;
cuDoubleComplex* y_casted = (cuDoubleComplex*) y;
cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
}
void cublasCgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float _Complex alpha,
const float _Complex *A, int lda, const float _Complex *x, int incx,
float _Complex beta, float _Complex *y, int incy) {
float _Complex beta, float _Complex *y, int incy) {
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
const cuFloatComplex* x_casted = (const cuFloatComplex*) x;
cuFloatComplex* y_casted = (cuFloatComplex*) y;
cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
}
void cublasDgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc) {
cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
......@@ -354,8 +390,8 @@ extern "C" {
float alpha, const float *A, int lda,
const float *B, int ldb, float beta,
float *C, int ldc) {
cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
......@@ -363,15 +399,15 @@ extern "C" {
double _Complex alpha, const double _Complex *A, int lda,
const double _Complex *B, int ldb, double _Complex beta,
double _Complex *C, int ldc) {
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
const cuDoubleComplex* B_casted = (const cuDoubleComplex*) B;
cuDoubleComplex* C_casted = (cuDoubleComplex*) C;
cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
}
......@@ -379,28 +415,28 @@ extern "C" {
float _Complex alpha, const float _Complex *A, int lda,
const float _Complex *B, int ldb, float _Complex beta,
float _Complex *C, int ldc) {
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
const cuFloatComplex* B_casted = (const cuFloatComplex*) B;
cuFloatComplex* C_casted = (cuFloatComplex*) C;
cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
}
// todo: new CUBLAS API diverged from standard BLAS api for these functions
// todo: it provides out-of-place (and apparently more efficient) implementation
// todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm
void cublasDtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
int m, int n, double alpha, const double *A,
int lda, double *B, int ldb){
cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
}
......@@ -408,7 +444,7 @@ extern "C" {
int m, int n, float alpha, const float *A,
int lda, float *B, int ldb){
cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
}
......@@ -417,11 +453,11 @@ extern "C" {
int lda, double _Complex *B, int ldb){
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
cuDoubleComplex* B_casted = (cuDoubleComplex*) B;
cublasZtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cuDoubleComplex* B_casted = (cuDoubleComplex*) B;
cublasZtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
}
......@@ -430,14 +466,14 @@ extern "C" {
int lda, float _Complex *B, int ldb){
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
cuFloatComplex* B_casted = (cuFloatComplex*) B;
cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cuFloatComplex* B_casted = (cuFloatComplex*) B;
cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
}
}
#endif /* WITH_GPU_VERSION */
......@@ -51,9 +51,10 @@ module cuda_functions
integer(kind=ik) :: cudaMemcpyHostToDevice
integer(kind=ik) :: cudaMemcpyDeviceToHost
integer(kind=ik) :: cudaMemcpyDeviceToDevice
integer(kind=ik) :: cudaHostRegisterDefault
integer(kind=ik) :: cudaHostRegisterPortable
integer(kind=ik) :: cudaHostRegisterMapped
integer(kind=ik) :: cudaMemcpyDeviceToDevice
! TODO global variable, has to be changed
integer(kind=C_intptr_T) :: cublasHandle = -1
......@@ -77,7 +78,7 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_create_c
end interface
end interface
interface
function cublas_destroy_c(handle) result(istat) &
......@@ -87,15 +88,6 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_destroy_c
end interface
interface
function cuda_threadsynchronize_c() result(istat) &
bind(C,name="cudaThreadSynchronizeFromC")
use iso_c_binding
implicit none
integer(kind=C_INT) :: istat
end function cuda_threadsynchronize_c
end interface
interface
......@@ -160,6 +152,15 @@ module cuda_functions
end function
end interface
interface
function cuda_hostRegisterDefault_c() result(flag) &
bind(C, name="cudaHostRegisterDefaultFromC")
use iso_c_binding
implicit none
integer(kind=c_int) :: flag
end function
end interface
interface
function cuda_hostRegisterPortable_c() result(flag) &
bind(C, name="cudaHostRegisterPortableFromC")
......@@ -214,6 +215,34 @@ module cuda_functions
end function cuda_memcpy2d_c
end interface
interface
function cuda_host_register_c(a, size, flag) result(istat) &
bind(C, name="cudaHostRegisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=c_intptr_t), intent(in), value :: size
integer(kind=C_INT), intent(in), value :: flag
integer(kind=C_INT) :: istat
end function cuda_host_register_c
end interface
interface
function cuda_host_unregister_c(a) result(istat) &
bind(C, name="cudaHostUnregisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=C_INT) :: istat
end function cuda_host_unregister_c
end interface
! functions to allocate and free CUDA memory
interface
......@@ -243,6 +272,33 @@ module cuda_functions
end function cuda_malloc_c
end interface
interface
function cuda_free_host_c(a) result(istat) &
bind(C, name="cudaFreeHostFromC")
use iso_c_binding
implicit none
type(c_ptr), value :: a
integer(kind=C_INT) :: istat
end function cuda_free_host_c
end interface
interface
function cuda_malloc_host_c(a, width_height) result(istat) &
bind(C, name="cudaMallocHostFromC")
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in), value :: width_height
integer(kind=C_INT) :: istat
end function cuda_malloc_host_c
end interface
interface
function cuda_memset_c(a, val, size) result(istat) &
bind(C, name="cudaMemsetFromC")
......@@ -290,7 +346,7 @@ module cuda_functions
real(kind=C_FLOAT),value :: alpha,beta
integer(kind=C_intptr_T), value :: a, b, c
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_sgemm_c
end interface
......@@ -307,7 +363,7 @@ module cuda_functions
real(kind=C_DOUBLE), value :: alpha
integer(kind=C_intptr_T), value :: a, b
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_dtrmm_c
end interface
......@@ -527,19 +583,6 @@ module cuda_functions
success = .true.
#endif
end function
function cuda_threadsynchronize() result(success)
use iso_c_binding
implicit none
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_threadsynchronize_c() /= 0
#else
success = .true.
#endif
end function cuda_threadsynchronize
function cuda_setdevice(n) result(success)
use iso_c_binding
......@@ -614,6 +657,35 @@ module cuda_functions
#endif
end function cuda_free
function cuda_malloc_host(a, width_height) result(success)
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_malloc_host_c(a, width_height) /= 0
#else
success = .true.
#endif
end function
function cuda_free_host(a) result(success)
use iso_c_binding
implicit none
type(c_ptr), value :: a
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_free_host_c(a) /= 0
#else