Commit 8a07acb9 authored by Andreas Marek

Merge branch 'ELPA_GPU_pinned' into 'matrix_redistribute'

Elpa gpu pinned

See merge request !28
parents b380ce92 d13bdb79
@@ -2,6 +2,11 @@ Changelog for upcoming release
- not yet decided
Changelog for ELPA 2020.05.001
- improved documentation, including fixes for typos and errors in markdown
- fix a bug in the calling of Cannon's algorithm which might lead to crashes
for a square process grid
Changelog for ELPA 2019.11.001
- fix a bug when using parallel make builds
......
@@ -62,7 +62,10 @@
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
#endif
#if REALCASE == 1
......
@@ -115,6 +115,7 @@
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=c_intptr_t) :: num
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
@@ -629,13 +630,29 @@
qtmp2 = 0 ! Not really needed
if (useGPU) then
successCUDA = cuda_malloc(qtmp1_dev, gemm_dim_k * gemm_dim_l * size_of_datatype)
num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
successCUDA = cuda_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("merge_systems: qtmp1", successCUDA)
successCUDA = cuda_malloc(qtmp1_dev, num)
check_alloc_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_malloc(ev_dev, gemm_dim_l * gemm_dim_m * size_of_datatype)
num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
successCUDA = cuda_host_register(int(loc(ev),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("merge_systems: ev", successCUDA)
successCUDA = cuda_malloc(ev_dev, num)
check_alloc_cuda("merge_systems: ev_dev", successCUDA)
successCUDA = cuda_malloc(qtmp2_dev, gemm_dim_k * gemm_dim_m * size_of_datatype)
num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
successCUDA = cuda_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("merge_systems: qtmp2", successCUDA)
successCUDA = cuda_malloc(qtmp2_dev, num)
check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
@@ -860,21 +877,31 @@
enddo !ns = 0, nqcols1-1, max_strip ! strip-mining loop
enddo !do np = 1, npc_n
deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"merge_systems: error when deallocating ev "//errorMessage
stop 1
endif
if (useGPU) then
successCUDA = cuda_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
check_host_unregister_cuda("merge_systems: qtmp1", successCUDA)
successCUDA = cuda_free(qtmp1_dev)
check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
check_host_unregister_cuda("merge_systems: qtmp2", successCUDA)
successCUDA = cuda_free(qtmp2_dev)
check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA)
successCUDA = cuda_host_unregister(int(loc(ev),kind=c_intptr_t))
check_host_unregister_cuda("merge_systems: ev", successCUDA)
successCUDA = cuda_free(ev_dev)
check_dealloc_cuda("merge_systems: ev_dev", successCUDA)
endif
deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"merge_systems: error when deallocating ev "//errorMessage
stop 1
endif
endif !very outer test (na1==1 .or. na1==2)
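A note on the pattern these merge_systems hunks establish: each host buffer that takes part in device transfers is pinned with cuda_host_register right before its device counterpart is allocated, and the teardown runs in reverse order, so the buffer is unregistered while it is still allocated. A minimal sketch for a single buffer, assuming ELPA's Fortran CUDA wrappers and the check_* macros defined above (the "sketch" tags are illustrative):

  ! pin the existing allocatable so the GPU can DMA directly from/to it
  num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
  successCUDA = cuda_host_register(int(loc(qtmp1),kind=c_intptr_t), num, &
                                   cudaHostRegisterDefault)
  check_host_register_cuda("sketch: qtmp1", successCUDA)

  successCUDA = cuda_malloc(qtmp1_dev, num)   ! device buffer of the same byte count
  check_alloc_cuda("sketch: qtmp1_dev", successCUDA)

  ! ... cuda_memcpy traffic and compute ...

  ! unregister first, while loc(qtmp1) still points at allocated memory,
  ! then release the device buffer and finally the host array
  successCUDA = cuda_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
  check_host_unregister_cuda("sketch: qtmp1", successCUDA)
  successCUDA = cuda_free(qtmp1_dev)
  check_dealloc_cuda("sketch: qtmp1_dev", successCUDA)
  deallocate(qtmp1)

This ordering is also why the hunk moves the deallocate of ev, qtmp1 and qtmp2 below the cuda_host_unregister calls.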
#ifdef WITH_OPENMP
deallocate(z_p, stat=istat, errmsg=errorMessage)
......
@@ -282,7 +282,7 @@ function elpa_solve_evp_&
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for gpu. Aborting..."
stop
endif
if (gpu .eq. 1) then
@@ -293,7 +293,7 @@ function elpa_solve_evp_&
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for skewsymmetric. Aborting..."
stop
endif
@@ -317,7 +317,7 @@ function elpa_solve_evp_&
call obj%get("debug", debug,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
print *,"Problem setting option for debug. Aborting..."
stop
endif
wantDebug = debug == 1
@@ -354,21 +354,21 @@ function elpa_solve_evp_&
if(do_useGPU) then
call obj%get("gpu_tridiag", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for gpu_tridiag. Aborting..."
stop
endif
do_useGPU_tridiag = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for gpu_solve_tridi. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for gpu_trans_ev. Aborting..."
stop
endif
do_useGPU_trans_ev = (gpu == 1)
@@ -480,7 +480,7 @@ function elpa_solve_evp_&
else
call obj%get("check_pd",check_pd,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
print *,"Problem setting option for check_pd. Aborting..."
stop
endif
if (check_pd .eq. 1) then
......
@@ -123,14 +123,18 @@
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), pointer :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:)
MATH_DATATYPE(kind=rck), pointer :: tmat(:,:)
MATH_DATATYPE(kind=rck), pointer :: hvm1(:)
type(c_ptr) :: tmp1_host, tmp2_host
type(c_ptr) :: hvm1_host, tmat_host
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=c_intptr_t) :: num
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
@@ -173,10 +177,22 @@
max_stored_rows = (max_stored_rows_fac/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmat", istat, errorMessage)
if (.not.(useGPU)) then
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmat", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp2", istat, errorMessage)
endif
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
@@ -188,16 +204,6 @@
&MATH_DATATYPE&
&", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
@@ -227,10 +233,29 @@
if (useGPU) then
! todo: this is used only for copying hvm to the device; it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
!allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
!call check_alloc("trans_ev_&
!&MATH_DATATYPE&
!&", "hvm1", istat, errorMessage)
num = (max_local_rows*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(hvm1_host,num)
check_alloc_cuda("trans_ev: hvm1_host", successCUDA)
call c_f_pointer(hvm1_host,hvm1,(/num/))
num = (max_stored_rows*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmat_host,num)
check_alloc_cuda("trans_ev: tmat_host", successCUDA)
call c_f_pointer(tmat_host,tmat,(/max_stored_rows,max_stored_rows/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmp1_host,num)
check_alloc_cuda("trans_ev: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host,tmp1,(/num/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmp2_host,num)
check_alloc_cuda("trans_ev: tmp2_host", successCUDA)
call c_f_pointer(tmp2_host,tmp2,(/num/))
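The hunks above switch tmp1, tmp2, tmat and hvm1 from plain allocatables to views of pinned host memory: cuda_malloc_host hands back a c_ptr sized in bytes, and c_f_pointer overlays a Fortran array pointer on it. A minimal sketch under the same assumptions as before; buf and buf_host are illustrative names, and the shape passed to c_f_pointer counts elements of the pointee, not bytes:

  type(c_ptr)                      :: buf_host
  MATH_DATATYPE(kind=rck), pointer :: buf(:)
  integer(kind=c_intptr_t)         :: num

  num = (max_local_cols*max_stored_rows) * size_of_datatype  ! bytes for the allocation
  successCUDA = cuda_malloc_host(buf_host, num)
  check_host_alloc_cuda("sketch: buf_host", successCUDA)
  call c_f_pointer(buf_host, buf, (/max_local_cols*max_stored_rows/))  ! shape in elements

  ! ... buf now behaves like an ordinary rank-1 array ...

  successCUDA = cuda_free_host(buf_host)
  check_host_dealloc_cuda("sketch: buf_host", successCUDA)
  nullify(buf)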
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
@@ -241,12 +266,16 @@
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_datatype)
num = ldq * matrixCols * size_of_datatype
successCUDA = cuda_malloc(q_dev, num)
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("trans_ev: q_mat", successCUDA)
successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
num, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
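Throughout these hunks a host address is passed to the wrappers as int(loc(...),kind=c_intptr_t), matching their c_intptr_t pointer arguments, and the byte count num computed for the allocation is reused for the copies. A sketch of the matching H2D/D2H pair for q_mat, under the same assumptions:

  num = ldq * matrixCols * size_of_datatype
  successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
                            num, cudaMemcpyHostToDevice)
  check_memcpy_cuda("sketch: q_mat to q_dev", successCUDA)

  ! ... back-transformation of the eigenvectors on the device ...

  successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), q_dev, &
                            num, cudaMemcpyDeviceToHost)
  check_memcpy_cuda("sketch: q_dev to q_mat", successCUDA)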
@@ -458,7 +487,7 @@
enddo ! istep=1,na,nblk
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
deallocate(h1, h2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_&
&MATH_DATATYPE&
@@ -472,13 +501,32 @@
q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm1 "//errorMessage
stop 1
endif
successCUDA = cuda_host_unregister(int(loc(q_mat),kind=c_intptr_t))
check_host_unregister_cuda("trans_ev: q_mat", successCUDA)
successCUDA = cuda_free_host(hvm1_host)
check_host_dealloc_cuda("trans_ev: hvm1_host", successCUDA)
nullify(hvm1)
successCUDA = cuda_free_host(tmat_host)
check_host_dealloc_cuda("trans_ev: tmat_host", successCUDA)
nullify(tmat)
successCUDA = cuda_free_host(tmp1_host)
check_host_dealloc_cuda("trans_ev: tmp1_host", successCUDA)
nullify(tmp1)
successCUDA = cuda_free_host(tmp2_host)
check_host_dealloc_cuda("trans_ev: tmp2_host", successCUDA)
nullify(tmp2)
!deallocate(hvm1, stat=istat, errmsg=errorMessage)
!if (istat .ne. 0) then
! print *,"trans_ev_&
! &MATH_DATATYPE&
! &: error when deallocating hvm1 "//errorMessage
! stop 1
!endif
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
successCUDA = cuda_free(q_dev)
@@ -492,9 +540,17 @@
successCUDA = cuda_free(tmat_dev)
check_dealloc_cuda("trans_ev", successCUDA)
else
deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm "//errorMessage
stop 1
endif
endif
call obj%timer%stop("trans_ev_&
&MATH_DATATYPE&
&" // &
......
@@ -100,18 +100,18 @@
call obj%get("mpi_comm_rows",mpi_comm_rows,error )
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for mpi_comm_rows. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for mpi_comm_cols. Aborting..."
stop
endif
call obj%get("debug",debug,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for debug settings. Aborting..."
stop
endif
if (debug == 1) then
......
@@ -93,18 +93,18 @@
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Error getting option. Aborting..."
print *,"Error getting option for mpi_comm_rows. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Error getting option. Aborting..."
print *,"Error getting option for mpi_comm_cols. Aborting..."
stop
endif
call obj%get("debug", debug,error)
if (error .ne. ELPA_OK) then
print *,"Error getting option. Aborting..."
print *,"Error getting option for debug. Aborting..."
stop
endif
if (debug == 1) then
......
@@ -52,6 +52,15 @@
! Author: A. Marek, MPCDF
!cannot use __FILE__ because filename with path can be too long for gfortran (max line length)
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
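As the comment above explains, __FILE__ is avoided because an absolute path can push the expanded source line past gfortran's maximum line length; call sites therefore pass a short literal tag and only __LINE__ comes from the preprocessor. For illustration, a call such as

  check_memcpy_cuda("trans_ev", successCUDA)

expands to

  call check_memcpy_CUDA_f("trans_ev", 360, successCUDA)

where 360 stands for whatever line number __LINE__ yields at that call site.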
#include "../general/sanity.F90"
use elpa1_compute
@@ -111,7 +120,7 @@
! GPU settings
call obj%get("gpu", gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for gpu. Aborting..."
stop
endif
@@ -136,17 +145,17 @@
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for mpi_comm_rows. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for mpi_comm_cols. Aborting..."
stop
endif
call obj%get("mpi_comm_parent",mpi_comm_all,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
print *,"Problem getting option for mpi_comm_parent. Aborting..."
stop
endif
@@ -193,67 +202,34 @@
! copy b to b_dev
num = ldb*ldbCols*size_of_datatype
successCUDA = cuda_malloc(b_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc b_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(b),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaHostRegister b"
stop
endif
check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
successCUDA = cuda_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
cudaMemcpyHostToDevice)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy, b H2D"
endif
check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successCUDA)
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_malloc_host(aux_host,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMallocHost aux"
stop
endif
check_host_alloc_cuda("elpa_mult_at_b: aux_host", successCUDA)
call c_f_pointer(aux_host,aux_mat,(/l_rows,nblk_mult/))
successCUDA = cuda_malloc(aux_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc aux_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: aux_dev", successCUDA)
num = nblk_mult*l_cols*size_of_datatype
successCUDA = cuda_malloc_host(tmp1_host,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMallocHost tmp1_host"
stop
endif
check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host,tmp1,(/nblk_mult,l_cols/))
successCUDA = cuda_malloc(tmp1_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc tmp1_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successCUDA)
else ! useGPU
allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
@@ -265,28 +241,16 @@
endif ! useGPU
allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating aux_bc "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "aux_bc", istat, errorMessage)
allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating lrs_save "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "lrs_save", istat, errorMessage)
allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating lre_save "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "lre_save", istat, errorMessage)
a_lower = .false.
a_upper = .false.
@@ -393,24 +357,15 @@
if (lcs<=lce) then
allocate(tmp1(nstor,lcs:lce), tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating tmp1 "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "tmp1", istat, errorMessage)
if (lrs<=lre) then
if (useGPU) then
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
num, cudaMemcpyHostToDevice)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy aux_mat H2D"
stop
endif
check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successCUDA)
aux_off = (lrs-1)*size_of_datatype
b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
@@ -424,12 +379,7 @@
num = nstor*(lce-lcs+1)*size_of_datatype
successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp1_dev, num, cudaMemcpyDeviceToHost)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy tmp1 D2H"
stop
endif
check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successCUDA)
else ! useGPU
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
@@ -478,55 +428,25 @@