Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
15
Issues
15
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
69972fbf
Commit
69972fbf
authored
Aug 06, 2020
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Some cleanup
parent
03d44007
Changes
8
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
2629 additions
and
2424 deletions
+2629
-2424
configure.ac
configure.ac
+11
-2
m4/ax_check_gcc_version.m4
m4/ax_check_gcc_version.m4
+24
-0
m4/ax_compare_version.m4
m4/ax_compare_version.m4
+174
-0
src/elpa1/elpa1_merge_systems_real_template.F90
src/elpa1/elpa1_merge_systems_real_template.F90
+1005
-1005
src/elpa1/elpa1_solve_tridi_real_template.F90
src/elpa1/elpa1_solve_tridi_real_template.F90
+502
-502
src/elpa1/elpa1_tools_template.F90
src/elpa1/elpa1_tools_template.F90
+211
-211
src/elpa1/elpa1_trans_ev_template.F90
src/elpa1/elpa1_trans_ev_template.F90
+392
-392
src/elpa1/elpa1_tridiag_template.F90
src/elpa1/elpa1_tridiag_template.F90
+310
-312
No files found.
configure.ac
View file @
69972fbf
...
...
@@ -46,7 +46,7 @@ AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, w
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20200417], [Current ELPA autotune version])
AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20200417, "Current ELPA autotune version")
AC_DEFINE_UNQUOTED([ELPA_BUILDTIME], [$ELPA_BUILDTIME], ["Time of build"])
AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1604905771],[old_elpa_version=yes],[old_elpa_version=no])
AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then
...
...
@@ -1776,6 +1776,9 @@ else
echo "build config should be compiled into the library: no"
fi
if test x"$have_loop_blocking" = x"yes"; then
AC_DEFINE([LOOP_BLOCKING],[1],[use blocking in loops])
fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
...
...
@@ -1986,4 +1989,10 @@ else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
if test x"$old_elpa_version" = x"yes"; then
echo " "
echo " It is possible that your current version of ELPA is not the latest one."
echo " You might want to have a look at https://elpa.mpcdf.mpg.de, whether a more recent"
echo " version has been released already"
echo " "
fi
m4/ax_check_gcc_version.m4
0 → 100644
View file @
69972fbf
dnl AX_GCC_VERSION
dnl
dnl Sets and substitutes GCC_VERSION.  When $GCC is "yes" and the
dnl AX_CHECK_COMPILE_FLAG probe for -dumpversion did not yield "no", the
dnl value is the output of "$CC -dumpversion", cached in ax_cv_gcc_version.
dnl In every other case GCC_VERSION is left empty.
dnl
dnl AX_CHECK_COMPILE_FLAG is not defined in this file; it is presumably the
dnl Autoconf Archive macro of that name and must be available to aclocal.
AC_DEFUN([AX_GCC_VERSION], [
  # Default: empty unless a version can be determined below.
  GCC_VERSION=""
  AX_CHECK_COMPILE_FLAG([-dumpversion],
    [ax_gcc_version_option=yes],
    [ax_gcc_version_option=no])
  AS_IF([test "x$GCC" = "xyes"],[
    AS_IF([test "x$ax_gcc_version_option" != "xno"],[
      AC_CACHE_CHECK([gcc version],[ax_cv_gcc_version],[
        ax_cv_gcc_version="`$CC -dumpversion`"
        AS_IF([test "x$ax_cv_gcc_version" = "x"],[
          ax_cv_gcc_version=""
        ])
      ])
      GCC_VERSION=$ax_cv_gcc_version
    ])
  ])
  AC_SUBST([GCC_VERSION])
])
m4/ax_compare_version.m4
0 → 100644
View file @
69972fbf
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
#
# DESCRIPTION
#
# This macro compares two version strings. Due to the various number of
# minor-version numbers that can exist, and the fact that string
# comparisons are not compatible with numeric comparisons, this is not
# necessarily trivial to do in a autoconf script. This macro makes doing
# these comparisons easy.
#
# The six basic comparisons are available, as well as checking equality
# limited to a certain number of minor-version levels.
#
# The operator OP determines what type of comparison to do, and can be one
# of:
#
# eq - equal (test A == B)
# ne - not equal (test A != B)
# le - less than or equal (test A <= B)
# ge - greater than or equal (test A >= B)
# lt - less than (test A < B)
# gt - greater than (test A > B)
#
# Additionally, the eq and ne operator can have a number after it to limit
# the test to that number of minor versions.
#
# eq0 - equal up to the length of the shorter version
# ne0 - not equal up to the length of the shorter version
# eqN - equal up to N sub-version levels
# neN - not equal up to N sub-version levels
#
# When the condition is true, shell commands ACTION-IF-TRUE are run,
# otherwise shell commands ACTION-IF-FALSE are run. The environment
# variable 'ax_compare_version' is always set to either 'true' or 'false'
# as well.
#
# Examples:
#
# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8])
# AX_COMPARE_VERSION([3.15],[lt],[3.15.8])
#
# would both be true.
#
# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8])
# AX_COMPARE_VERSION([3.15],[gt],[3.15.8])
#
# would both be false.
#
# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8])
#
# would be true because it is only comparing two minor versions.
#
# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15])
#
# would be true because it is only comparing the lesser number of minor
# versions of the two values.
#
# Note: The characters that separate the version numbers do not matter. An
# empty string is the same as version 0. OP is evaluated by autoconf, not
# configure, so must be a string, not a variable.
#
# The author would like to acknowledge Guido Draheim whose advice about
# the m4_case and m4_ifvaln functions make this macro only include the
# portions necessary to perform the specific comparison specified by the
# OP argument in the final configure script.
#
# LICENSE
#
# Copyright (c) 2008 Tim Toolan <toolan@ele.uri.edu>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 13
dnl #########################################################################
AC_DEFUN([AX_COMPARE_VERSION], [
  AC_REQUIRE([AC_PROG_AWK])

  # Used to indicate true or false condition
  ax_compare_version=false

  # Convert the two version strings to be compared into a format that
  # allows a simple string comparison. The end result is that a version
  # string of the form 1.12.5-r617 will be converted to the form
  # 0001001200050617. In other words, each number is zero padded to four
  # digits, and non digits are removed.
  AS_VAR_PUSHDEF([A],[ax_compare_version_A])
  A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
                     -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/[[^0-9]]//g'`

  AS_VAR_PUSHDEF([B],[ax_compare_version_B])
  B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
                     -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/[[^0-9]]//g'`

  dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary
  dnl # then the first line is used to determine if the condition is true.
  dnl # The sed right after the echo is to remove any indented white space.
  m4_case(m4_tolower($2),
  [lt],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"`
  ],
  [gt],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"`
  ],
  [le],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"`
  ],
  [ge],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"`
  ],[
    dnl Split the operator from the subversion count if present.
    m4_bmatch(m4_substr($2,2),
    [0],[
      # A count of zero means use the length of the shorter version.
      # Determine the number of characters in A and B.
      ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'`
      ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'`

      # Set A to no more than B's length and B to no more than A's length.
      A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"`
      B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"`
    ],
    [[0-9]+],[
      # A count greater than zero means use only that many subversions
      A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
      B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
    ],
    [.+],[
      AC_WARNING(
        [invalid OP numeric parameter: $2])
    ],[])

    # Pad zeros at end of numbers to make same length.
    ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`"
    B="$B`echo $A | sed 's/./0/g'`"
    A="$ax_compare_version_tmp_A"

    # Check for equality or inequality as necessary.
    m4_case(m4_tolower(m4_substr($2,0,2)),
    [eq],[
      test "x$A" = "x$B" && ax_compare_version=true
    ],
    [ne],[
      test "x$A" != "x$B" && ax_compare_version=true
    ],[
      AC_WARNING([invalid OP parameter: $2])
    ])
  ])

  AS_VAR_POPDEF([A])dnl
  AS_VAR_POPDEF([B])dnl

  dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE.
  dnl # Besides ax_compare_version this macro sets no shell variable of its
  dnl # own; anything else a caller wants set belongs in the action arguments.
  if test "$ax_compare_version" = "true" ; then
    m4_ifvaln([$4],[$4],[:])dnl
    m4_ifvaln([$5],[else $5])dnl
  fi
]) dnl AX_COMPARE_VERSION
src/elpa1/elpa1_merge_systems_real_template.F90
View file @
69972fbf
...
...
@@ -54,8 +54,8 @@
#include "../general/sanity.F90"
subroutine
merge_systems_
&
&
PRECISION
&
subroutine
merge_systems_
&
&
PRECISION
&
(
obj
,
na
,
nm
,
d
,
e
,
q
,
ldq
,
nqoff
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
&
l_col
,
p_col
,
l_col_out
,
p_col_out
,
npc_0
,
npc_n
,
useGPU
,
wantDebug
,
success
,
max_threads
)
use
cuda_functions
...
...
src/elpa1/elpa1_solve_tridi_real_template.F90
View file @
69972fbf
...
...
@@ -338,12 +338,12 @@ subroutine solve_tridi_&
end
subroutine
merge_recursive_
&
&
PRECISION
end
subroutine
solve_tridi_
&
&
PRECISION_AND_SUFFIX
end
subroutine
solve_tridi_
&
&
PRECISION_AND_SUFFIX
subroutine
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
na
,
nev
,
nqoff
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
,
max_threads
)
subroutine
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
na
,
nev
,
nqoff
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
,
max_threads
)
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
...
...
@@ -554,12 +554,12 @@ subroutine solve_tridi_&
call
obj
%
timer
%
stop
(
"solve_tridi_col"
//
PRECISION_SUFFIX
)
end
subroutine
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
end
subroutine
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
subroutine
solve_tridi_single_problem_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
nlen
,
d
,
e
,
q
,
ldq
,
wantDebug
,
success
)
subroutine
solve_tridi_single_problem_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
nlen
,
d
,
e
,
q
,
ldq
,
wantDebug
,
success
)
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
...
...
@@ -673,6 +673,6 @@ subroutine solve_tridi_&
enddo
call
obj
%
timer
%
stop
(
"solve_tridi_single"
//
PRECISION_SUFFIX
)
end
subroutine
solve_tridi_single_problem_
&
&
PRECISION_AND_SUFFIX
end
subroutine
solve_tridi_single_problem_
&
&
PRECISION_AND_SUFFIX
src/elpa1/elpa1_tools_template.F90
View file @
69972fbf
...
...
@@ -56,9 +56,9 @@
#if REALCASE == 1
subroutine
v_add_s_
&
&
PRECISION
&
&(
obj
,
v
,
n
,
s
)
subroutine
v_add_s_
&
&
PRECISION
&
&(
obj
,
v
,
n
,
s
)
use
precision
use
elpa_abstract_impl
implicit
none
...
...
@@ -68,12 +68,12 @@
real
(
kind
=
rk
)
::
v
(
n
),
s
v
(:)
=
v
(:)
+
s
end
subroutine
v_add_s_
&
&
PRECISION
end
subroutine
v_add_s_
&
&
PRECISION
subroutine
distribute_global_column_
&
&
PRECISION
&
&(
obj
,
g_col
,
l_col
,
noff
,
nlen
,
my_prow
,
np_rows
,
nblk
)
subroutine
distribute_global_column_
&
&
PRECISION
&
&(
obj
,
g_col
,
l_col
,
noff
,
nlen
,
my_prow
,
np_rows
,
nblk
)
use
precision
use
elpa_abstract_impl
implicit
none
...
...
@@ -101,60 +101,60 @@
l_col
(
l_off
+
js
:
l_off
+
je
)
=
g_col
(
g_off
+
js
-
noff
:
g_off
+
je
-
noff
)
enddo
end
subroutine
distribute_global_column_
&
&
PRECISION
subroutine
solve_secular_equation_
&
&
PRECISION
&
&(
obj
,
n
,
i
,
d
,
z
,
delta
,
rho
,
dlam
)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
!
! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0
!
! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique
! which is more robust (it always yields a solution) but also slower
! than the algorithm used in DLAED4.
!
! The same restrictions as in DLAED4 hold, namely:
!
! rho > 0 and d(i+1) > d(i)
!
! but this routine will not terminate with error if these are not satisfied
! (it will normally converge to a pole in this case).
!
! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases
! N=1 and N=2 which is not compatible with DLAED4.
! Thus this routine shouldn't be used for these cases as a simple replacement
! of DLAED4.
!
! The arguments are the same as in DLAED4 (with the exception of the INFO argument):
!
!
! N (input) INTEGER
! The length of all arrays.
!
! I (input) INTEGER
! The index of the eigenvalue to be computed. 1 <= I <= N.
!
! D (input) DOUBLE PRECISION array, dimension (N)
! The original eigenvalues. It is assumed that they are in
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating Vector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
! See remark above about DLAED4 compatibility!
!
! RHO (input) DOUBLE PRECISION
! The scalar in the symmetric updating formula.
!
! DLAM (output) DOUBLE PRECISION
! The computed lambda_I, the I-th updated eigenvalue.
!-------------------------------------------------------------------------------
end
subroutine
distribute_global_column_
&
&
PRECISION
subroutine
solve_secular_equation_
&
&
PRECISION
&
&(
obj
,
n
,
i
,
d
,
z
,
delta
,
rho
,
dlam
)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
!
! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0
!
! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique
! which is more robust (it always yields a solution) but also slower
! than the algorithm used in DLAED4.
!
! The same restrictions as in DLAED4 hold, namely:
!
! rho > 0 and d(i+1) > d(i)
!
! but this routine will not terminate with error if these are not satisfied
! (it will normally converge to a pole in this case).
!
! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases
! N=1 and N=2 which is not compatible with DLAED4.
! Thus this routine shouldn't be used for these cases as a simple replacement
! of DLAED4.
!
! The arguments are the same as in DLAED4 (with the exception of the INFO argument):
!
!
! N (input) INTEGER
! The length of all arrays.
!
! I (input) INTEGER
! The index of the eigenvalue to be computed. 1 <= I <= N.
!
! D (input) DOUBLE PRECISION array, dimension (N)
! The original eigenvalues. It is assumed that they are in
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating Vector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
! See remark above about DLAED4 compatibility!
!
! RHO (input) DOUBLE PRECISION
! The scalar in the symmetric updating formula.
!
! DLAM (output) DOUBLE PRECISION
! The computed lambda_I, the I-th updated eigenvalue.
!-------------------------------------------------------------------------------
use
precision
use
elpa_abstract_impl
...
...
@@ -238,19 +238,19 @@
delta
(:)
=
delta
(:)
-
x
call
obj
%
timer
%
stop
(
"solve_secular_equation"
//
PRECISION_SUFFIX
)
end
subroutine
solve_secular_equation_
&
&
PRECISION
!-------------------------------------------------------------------------------
end
subroutine
solve_secular_equation_
&
&
PRECISION
!-------------------------------------------------------------------------------
#endif
#if REALCASE == 1
subroutine
hh_transform_real_
&
subroutine
hh_transform_real_
&
#endif
#if COMPLEXCASE == 1
subroutine
hh_transform_complex_
&
subroutine
hh_transform_complex_
&
#endif
&
PRECISION
&
(
obj
,
alpha
,
xnorm_sq
,
xf
,
tau
,
wantDebug
)
&
PRECISION
&
(
obj
,
alpha
,
xnorm_sq
,
xf
,
tau
,
wantDebug
)
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
#endif
...
...
@@ -353,7 +353,7 @@
&
PRECISION_SUFFIX
)
#if REALCASE == 1
end
subroutine
hh_transform_real_
&
end
subroutine
hh_transform_real_
&
#endif
#if COMPLEXCASE == 1
end
subroutine
hh_transform_complex_
&
...
...
src/elpa1/elpa1_trans_ev_template.F90
View file @
69972fbf
...
...
@@ -88,11 +88,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
subroutine
trans_ev_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
na
,
nqc
,
a_mat
,
lda
,
tau
,
q_mat
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
useGPU
)
subroutine
trans_ev_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
na
,
nqc
,
a_mat
,
lda
,
tau
,
q_mat
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
useGPU
)
use
cuda_functions
use
iso_c_binding
use
precision
...
...
@@ -553,7 +553,7 @@
&
PRECISION_SUFFIX
//
&
gpuString
)
end
subroutine
trans_ev_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
end
subroutine
trans_ev_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
src/elpa1/elpa1_tridiag_template.F90
View file @
69972fbf
...
...
@@ -882,7 +882,7 @@ subroutine tridiag_&
if
(
useGPU
)
then
if
(
.not.
mat_vec_as_one_block
)
then
if
(
.not.
mat_vec_as_one_block
)
then
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as for CPU version
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cublas"
)
...
...
@@ -911,7 +911,7 @@ subroutine tridiag_&
enddo
if
(
useGPU
)
then
if
(
mat_vec_as_one_block
)
then
if
(
mat_vec_as_one_block
)
then
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cublas"
)
...
...
@@ -970,7 +970,7 @@ subroutine tridiag_&
if
(
my_pcol
==
pcol
(
2
,
nblk
,
np_cols
))
then
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
))
then
! We use last l_cols value of loop above
if
(
useGPU
)
then
if
(
useGPU
)
then
successCUDA
=
cuda_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
1
*
size_of_datatype
,
cudaMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 5"
,
successCUDA
)
...
...
@@ -987,8 +987,6 @@ subroutine tridiag_&
#if COMPLEXCASE == 1
e_vec
(
1
)
=
real
(
vrl
,
kind
=
rk
)
#endif
a_mat
(
1
,
l_cols
)
=
1.
! for consistency only
endif
#ifdef WITH_MPI
...
...
@@ -1008,7 +1006,7 @@ subroutine tridiag_&
#endif /* WITH_MPI */
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
)
.and.
my_pcol
==
pcol
(
1
,
nblk
,
np_cols
))
then
if
(
useGPU
)
then
if
(
useGPU
)
then
successCUDA
=
cuda_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
,
&
1
*
size_of_datatype
,
cudaMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 6"
,
successCUDA
)
...
...
@@ -1024,7 +1022,7 @@ subroutine tridiag_&
! Store e_vec(1)
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
)
.and.
my_pcol
==
pcol
(
2
,
nblk
,
np_cols
))
then
if
(
useGPU
)
then
if
(
useGPU
)
then
successCUDA
=
cuda_memcpy
(
int
(
loc
(
e_vec
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
1
*
size_of_datatype
,
cudaMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 7"
,
successCUDA
)
...
...
@@ -1145,7 +1143,7 @@ subroutine tridiag_&
PRECISION_SUFFIX
//
&
gpuString
)
end
subroutine
tridiag_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
end
subroutine
tridiag_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment