elpa / elpa > Commits > d2e2f73c

Commit d2e2f73c
Authored Mar 02, 2020 by Andreas Marek

Modify test programs

Parent: 9a1e9b62
Changes: 4
Pipelines: 1
ci_test_scripts/generate_gitlab_ci_tests.py
@@ -661,8 +661,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
   # add tests for scalapack for some specific test cases
   runScalapackTest = False
   if (instr == "avx2" and cov == "coverage" and m == "mpi"):
     runScalapackTest = True
   #if (instr == "avx2" and cov == "coverage" and m == "mpi"):
   #if (instr == "avx2" and m == "mpi"):
   #  runScalapackTest = True
   # address-sanitize only with gnu compiler
@@ -790,7 +791,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
   if (runScalapackTest):
     print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\"" + c_compiler_wrapper + "\\\"" + " CFLAGS=\\\"" + CFLAGS + "\\\"" + " FC=\\\"" + fortran_compiler_wrapper + "\\\"" + " FCFLAGS=\\\"" + FCFLAGS + "\\\"" \
           + libs + " " + ldflags + " " + " " + scalapackldflags + " " + scalapackfcflags \
-          + " --enable-option-checking=fatal --enable-scalapack-tests" + " " + mpi_configure_flag + " " + openmp[o] \
+          + " --enable-option-checking=fatal --enable-scalapack-tests --enable-autotune-redistribute-matrix " + " " + mpi_configure_flag + " " + openmp[o] \
           + " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
           + " " + gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -q \"srun\" -S $SLURM -g " + gpuJob)
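The two hunks above gate the ScaLAPACK test job and extend the generated configure line with --enable-autotune-redistribute-matrix. A minimal Python sketch of that pattern (illustrative names only, not the generator's actual code):

  # Sketch of the gating and flag construction shown in the hunks above;
  # the function name and return value are illustrative, not ELPA's generator API.
  def scalapack_configure_flags(instr, cov, m):
      run_scalapack_test = (instr == "avx2" and cov == "coverage" and m == "mpi")
      if not run_scalapack_test:
          return None
      # the commit appends --enable-autotune-redistribute-matrix to the existing flags
      return ("--enable-option-checking=fatal "
              "--enable-scalapack-tests "
              "--enable-autotune-redistribute-matrix")

  if __name__ == "__main__":
      print(scalapack_configure_flags("avx2", "coverage", "mpi"))
      print(scalapack_configure_flags("avx512", "coverage", "mpi"))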
src/elpa1/elpa1_tridiag_template.F90
@@ -57,7 +57,7 @@
 #undef SAVE_MATR
 #ifdef DOUBLE_PRECISION_REAL
 #define SAVE_MATR(name, iteration) \
-call prmat(na, useGpu, a_mat, a_dev, lda, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration)
+call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration)
 #else
 #define SAVE_MATR(name, iteration)
 #endif
@@ -69,12 +69,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
 !> \param obj          object of elpa_type
 !> \param na           Order of matrix
 !>
-!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
+!> \param a_mat(matrixRows,matrixCols)    Distributed matrix which should be reduced.
 !>              Distribution is like in Scalapack.
 !>              Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
 !>              a(:,:) is overwritten on exit with the Householder vectors
 !>
-!> \param lda          Leading dimension of a
+!> \param matrixRows   Leading dimension of a
 !>
 !> \param nblk         blocksize of cyclic distribution, must be the same in both directions!
 !>
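The parameter renamed here is the leading dimension of the local block: the allocated row stride, which can exceed the number of rows that actually hold data. A small column-major sketch with NumPy (illustrative values, not ELPA code):

  import numpy as np

  # Column-major (Fortran) storage: element (i, j) of the local block sits at
  # linear offset i + j*matrixRows, where matrixRows is the leading dimension.
  matrixRows, matrixCols = 8, 5   # allocated row stride and local columns (illustrative)
  used_rows = 6                   # rows of the block that actually hold matrix data

  a_mat = np.zeros((matrixRows, matrixCols), order="F")
  a_mat[:used_rows, :] = np.arange(used_rows * matrixCols).reshape(used_rows, matrixCols, order="F")

  i, j = 2, 3
  flat = a_mat.ravel(order="F")
  assert flat[i + j * matrixRows] == a_mat[i, j]
  print("leading dimension:", matrixRows, "linear offset of (2,3):", i + j * matrixRows)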
@@ -96,7 +96,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
 &MATH_DATATYPE&
 &_&
 &PRECISION&
-    (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
+    (obj, na, a_mat, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
      use cuda_functions
      use iso_c_binding
      use precision
@@ -107,17 +107,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      implicit none
 #include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout)    :: obj
-     integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+     integer(kind=ik), intent(in)                  :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU, wantDebug
      integer(kind=c_int)                           :: skewsymmetric
      logical                                       :: isSkewsymmetric
      MATH_DATATYPE(kind=rck), intent(out)          :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-     MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,*)
+     MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,*)
 #else
-     MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,matrixCols)
+     MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,matrixCols)
 #endif
      real(kind=rk), intent(out)                    :: d_vec(na)
      real(kind=rk), intent(out)                    :: e_vec(na)
@@ -356,11 +356,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      if (useGPU) then
        ! allocate memmory for matrix A on the device and than copy the matrix
-       successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
+       successCUDA = cuda_malloc(a_dev, matrixRows * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)
        successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
-                                 lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
+                                 matrixRows * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif
@@ -383,7 +383,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
        ! copy l_cols + 1 column of A to v_row
        if (useGPU) then
-         a_offset = l_cols * lda * size_of_datatype
+         a_offset = l_cols * matrixRows * size_of_datatype
          ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
          ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
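The offset computed in this hunk addresses the start of a column inside the flat device copy of a_mat; with column-major storage and row stride matrixRows, column index l_cols (0-based) begins l_cols*matrixRows elements into the buffer. A small sketch of that arithmetic (illustrative values; 8-byte reals assumed):

  # Byte offset of a column in a column-major buffer with row stride matrixRows,
  # mirroring a_offset = l_cols * matrixRows * size_of_datatype above.
  matrixRows = 1024        # leading dimension of the local block (illustrative)
  size_of_datatype = 8     # bytes per double-precision real (assumption)

  def column_byte_offset(l_cols):
      return l_cols * matrixRows * size_of_datatype

  print(column_byte_offset(0))   # first column starts at byte 0
  print(column_byte_offset(3))   # fourth column starts 3*matrixRows elements in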
@@ -399,6 +399,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
 #if COMPLEXCASE == 1
          aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
 #endif
          call PRECISION_GEMV('N', &
                              int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
                              ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
@@ -543,19 +544,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
              if (wantDebug) call obj%timer%start("blas")
              call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                        int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                       ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND), &
+                       ONE, a_mat(l_row_beg, l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                        v_row(l_row_beg), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
              if (i/=j) then
                if (isSkewsymmetric) then
-                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                     -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
-                                     ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1, &
+                                     kind=BLAS_KIND), &
+                                     -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), v_col(l_col_beg), &
+                                     1_BLAS_KIND, ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                else
-                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                     ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
-                                     ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1, &
+                                     kind=BLAS_KIND), &
+                                     ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), v_col(l_col_beg), &
+                                     1_BLAS_KIND, ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                endif
              endif
              if (wantDebug) call obj%timer%stop("blas")
@@ -570,19 +573,19 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
            if (wantDebug) call obj%timer%start("blas")
            call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                      int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                     ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND), &
+                     ONE, a_mat(l_row_beg, l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                      v_row(l_row_beg), 1_BLAS_KIND, &
                      ONE, u_col(l_col_beg), 1_BLAS_KIND)
            if (i/=j) then
              if (isSkewsymmetric) then
                call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                   -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
+                                   -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                    v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
              else
                call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                   ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
+                                   ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                    v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
              endif
            endif
@@ -594,20 +597,20 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
        enddo  ! i=0,(istep-2)/tile_size
        if (useGPU) then
          if (mat_vec_as_one_block) then
            ! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
            ! this requires altering of the algorithm when later explicitly updating the matrix
            ! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
            if (wantDebug) call obj%timer%start("cublas")
            call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows, l_cols, &
-                                      ONE, a_dev, lda, &
+                                      ONE, a_dev, matrixRows, &
                                       v_row_dev, 1, &
                                       ONE, u_col_dev, 1)
            ! todo: try with non transposed!!!
            ! if(i/=j) then
            !   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
-           !                              ONE, a_dev + a_offset, lda, &
+           !                              ONE, a_dev + a_offset, matrixRows, &
            !                              v_col_dev + (l_col_beg - 1) * &
            !                              size_of_datatype, 1, &
            !                              ONE, u_row_dev + (l_row_beg - 1) * &
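The comments in this hunk contrast one large matrix-vector product against per-tile products. A simplified NumPy sketch (illustrative sizes, ignoring the triangular tile structure and MPI distribution of the real code) showing that accumulating stripe contributions reproduces the single-call result:

  import numpy as np

  rng = np.random.default_rng(0)
  l_rows, l_cols, stripe = 12, 9, 4
  a = rng.standard_normal((l_rows, l_cols))
  v_row = rng.standard_normal(l_rows)

  # One large transposed mat-vec, as in the single cublas_PRECISION_GEMV call above.
  u_col_one_block = a.T @ v_row

  # Accumulation over row stripes, as in the striped variant further below.
  u_col_striped = np.zeros(l_cols)
  for beg in range(0, l_rows, stripe):
      end = min(beg + stripe, l_rows)
      u_col_striped += a[beg:end, :].T @ v_row[beg:end]

  assert np.allclose(u_col_one_block, u_col_striped)
  print("one-block and striped accumulation agree")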
@@ -615,7 +618,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
            ! endif
            if (wantDebug) call obj%timer%stop("cublas")
-         else
+         else  ! mat_vec_as_one_block
            !perform multiplication by stripes - it is faster than by blocks, since we call cublas with
            !larger matrices. In general, however, this algorithm is very simmilar to the one with CPU
            do i=0,(istep-2)/tile_size
@@ -626,12 +629,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
              l_row_beg = 1
              l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
-             a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
+             a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                         size_of_datatype
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                         l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-                                        ONE, a_dev + a_offset, lda, &
+                                        ONE, a_dev + a_offset, matrixRows, &
                                         v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
                                         ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
            enddo
@@ -644,16 +647,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
              l_row_beg = 1
              l_row_end = min(l_rows, i*l_rows_per_tile)
-             a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
+             a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                         size_of_datatype
              if (isSkewsymmetric) then
                call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-                                          -ONE, a_dev + a_offset, lda, &
+                                          -ONE, a_dev + a_offset, matrixRows, &
                                           v_col_dev + (l_col_beg - 1) * size_of_datatype, 1, &
                                           ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
              else
                call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-                                          ONE, a_dev + a_offset, lda, &
+                                          ONE, a_dev + a_offset, matrixRows, &
                                           v_col_dev + (l_col_beg - 1) * size_of_datatype, 1, &
                                           ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
              endif
@@ -667,14 +670,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
            successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
                          u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif
-         ! call PRECISION_SYMV('U', l_cols, &
-         !                     1.d0, a_mat, ubound(a_mat,1), &
-         !                     v_row, 1, &
-         !                     0.d0, u_col, 1)
-         ! endif ! useGPU
        endif ! useGPU
 #ifdef WITH_OPENMP
        !$OMP END PARALLEL
@@ -830,8 +826,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
                                          size_of_datatype, &
                                          max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) * &
                                          size_of_datatype, &
-                                         max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
-                                         size_of_datatype, lda)
+                                         max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * matrixRows) * &
+                                         size_of_datatype, matrixRows)
              if (wantDebug) call obj%timer%stop("cublas")
            endif
          else !useGPU
@@ -841,7 +837,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
                              int(2*n_stored_vecs,kind=BLAS_KIND), &
                              ONE, vu_stored_rows(l_row_beg,1), int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
                              uv_stored_cols(l_col_beg,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
-                             ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND))
+                             ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND))
            if (wantDebug) call obj%timer%stop("blas")
          endif !useGPU
        enddo
@@ -854,7 +850,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
          call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
                                     ONE, vu_stored_rows_dev, max_local_rows, &
                                     uv_stored_cols_dev, max_local_cols, &
-                                    ONE, a_dev, lda)
+                                    ONE, a_dev, matrixRows)
          if (wantDebug) call obj%timer%stop("cublas")
        endif
      endif
@@ -865,7 +861,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
        if (useGPU) then
          !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
-         a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
+         a_offset = ((l_rows - 1) + matrixRows * (l_cols - 1)) * size_of_datatype
          successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
@@ -907,7 +903,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      if (my_prow == prow(1, nblk, np_rows)) then
        ! We use last l_cols value of loop above
        if (useGPU) then
-         successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
+         successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
          check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
          vrl = aux3(1)
@@ -961,7 +957,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(2, nblk, np_cols)) then
        if (useGPU) then
-         successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
+         successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
          check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
        else !useGPU
test/Fortran/test.F90
@@ -574,6 +574,12 @@ program test
    call e%set("nblk", int(nblk,kind=c_int), error_elpa)
    assert_elpa_ok(error_elpa)
+   if (layout .eq. 'C') then
+     call e%set("matrix_order",COLUMN_MAJOR_ORDER)
+   else
+     call e%set("matrix_order",ROW_MAJOR_ORDER)
+   endif
 #ifdef WITH_MPI
 #ifdef SPLIT_COMM_MYSELF
    call mpi_comm_split(MPI_COMM_WORLD, int(my_pcol,kind=MPI_KIND), int(my_prow,kind=MPI_KIND), &
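The added block tells ELPA whether the test matrix is stored row-major or column-major. A short NumPy illustration of the distinction the "matrix_order" setting describes (not the ELPA API itself):

  import numpy as np

  # The same 2x3 values in row-major ('C') and column-major ('F') memory order.
  a_c = np.array([[1, 2, 3], [4, 5, 6]], order="C")
  a_f = np.asfortranarray(a_c)

  # Logical contents are identical; only the linear memory layout differs.
  print(a_c.ravel(order="K"))   # [1 2 3 4 5 6]  row-major storage
  print(a_f.ravel(order="K"))   # [1 4 2 5 3 6]  column-major storage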
test/Fortran/test_autotune.F90
@@ -209,6 +209,12 @@ program test
    call e%set("nblk", int(nblk,kind=c_int), error_elpa)
    assert_elpa_ok(error_elpa)
+   if (layout .eq. 'C') then
+     call e%set("matrix_order",COLUMN_MAJOR_ORDER)
+   else
+     call e%set("matrix_order",ROW_MAJOR_ORDER)
+   endif
 #ifdef WITH_MPI
    call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
    assert_elpa_ok(error_elpa)
@@ -230,7 +236,7 @@ program test
    if (myid == 0) print *, ""
-   tune_state => e%autotune_setup(ELPA_AUTOTUNE_MEDIUM, AUTOTUNE_DOMAIN, error_elpa)
+   tune_state => e%autotune_setup(ELPA_AUTOTUNE_FAST, AUTOTUNE_DOMAIN, error_elpa)
    assert_elpa_ok(error_elpa)
    iter = 0