Commit ab3f568f authored by Andreas Marek's avatar Andreas Marek
Browse files

Re-integrate some changes from NVIDIA

parent 0adedef7
......@@ -428,7 +428,7 @@ AC_MSG_RESULT([${fortran_can_check_environment}])
dnl check whether GPU version is requested
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"
AC_MSG_CHECKING(whether GPU support is requested)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
# Helper functions for option handling. -*- Autoconf -*-
#
# Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation,
# Inc.
# Copyright (C) 2004-2005, 2007-2009, 2011-2015 Free Software
# Foundation, Inc.
# Written by Gary V. Vaughan, 2004
#
# This file is free software; the Free Software Foundation gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.
# serial 7 ltoptions.m4
# serial 8 ltoptions.m4
# This is to help aclocal find these macros, as it can't see m4_define.
AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])
......@@ -29,7 +29,7 @@ m4_define([_LT_SET_OPTION],
[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
_LT_MANGLE_DEFUN([$1], [$2]),
[m4_warning([Unknown $1 option `$2'])])[]dnl
[m4_warning([Unknown $1 option '$2'])])[]dnl
])
......@@ -75,13 +75,15 @@ m4_if([$1],[LT_INIT],[
dnl
dnl If no reference was made to various pairs of opposing options, then
dnl we run the default mode handler for the pair. For example, if neither
dnl `shared' nor `disable-shared' was passed, we enable building of shared
dnl 'shared' nor 'disable-shared' was passed, we enable building of shared
dnl archives by default:
_LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
_LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
_LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
_LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
[_LT_ENABLE_FAST_INSTALL])
[_LT_ENABLE_FAST_INSTALL])
_LT_UNLESS_OPTIONS([LT_INIT], [aix-soname=aix aix-soname=both aix-soname=svr4],
[_LT_WITH_AIX_SONAME([aix])])
])
])# _LT_SET_OPTIONS
......@@ -112,7 +114,7 @@ AU_DEFUN([AC_LIBTOOL_DLOPEN],
[_LT_SET_OPTION([LT_INIT], [dlopen])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you
put the `dlopen' option into LT_INIT's first parameter.])
put the 'dlopen' option into LT_INIT's first parameter.])
])
dnl aclocal-1.4 backwards compatibility:
......@@ -148,7 +150,7 @@ AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
_LT_SET_OPTION([LT_INIT], [win32-dll])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you
put the `win32-dll' option into LT_INIT's first parameter.])
put the 'win32-dll' option into LT_INIT's first parameter.])
])
dnl aclocal-1.4 backwards compatibility:
......@@ -157,9 +159,9 @@ dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])
# _LT_ENABLE_SHARED([DEFAULT])
# ----------------------------
# implement the --enable-shared flag, and supports the `shared' and
# `disable-shared' LT_INIT options.
# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
# implement the --enable-shared flag, and supports the 'shared' and
# 'disable-shared' LT_INIT options.
# DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'.
m4_define([_LT_ENABLE_SHARED],
[m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl
AC_ARG_ENABLE([shared],
......@@ -172,14 +174,14 @@ AC_ARG_ENABLE([shared],
*)
enable_shared=no
# Look at the argument we got. We use all the common list separators.
lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
for pkg in $enableval; do
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
if test "X$pkg" = "X$p"; then
enable_shared=yes
fi
done
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
;;
esac],
[enable_shared=]_LT_ENABLE_SHARED_DEFAULT)
......@@ -211,9 +213,9 @@ dnl AC_DEFUN([AM_DISABLE_SHARED], [])
# _LT_ENABLE_STATIC([DEFAULT])
# ----------------------------
# implement the --enable-static flag, and support the `static' and
# `disable-static' LT_INIT options.
# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
# implement the --enable-static flag, and support the 'static' and
# 'disable-static' LT_INIT options.
# DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'.
m4_define([_LT_ENABLE_STATIC],
[m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl
AC_ARG_ENABLE([static],
......@@ -226,14 +228,14 @@ AC_ARG_ENABLE([static],
*)
enable_static=no
# Look at the argument we got. We use all the common list separators.
lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
for pkg in $enableval; do
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
if test "X$pkg" = "X$p"; then
enable_static=yes
fi
done
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
;;
esac],
[enable_static=]_LT_ENABLE_STATIC_DEFAULT)
......@@ -265,9 +267,9 @@ dnl AC_DEFUN([AM_DISABLE_STATIC], [])
# _LT_ENABLE_FAST_INSTALL([DEFAULT])
# ----------------------------------
# implement the --enable-fast-install flag, and support the `fast-install'
# and `disable-fast-install' LT_INIT options.
# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
# implement the --enable-fast-install flag, and support the 'fast-install'
# and 'disable-fast-install' LT_INIT options.
# DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'.
m4_define([_LT_ENABLE_FAST_INSTALL],
[m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl
AC_ARG_ENABLE([fast-install],
......@@ -280,14 +282,14 @@ AC_ARG_ENABLE([fast-install],
*)
enable_fast_install=no
# Look at the argument we got. We use all the common list separators.
lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
for pkg in $enableval; do
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
if test "X$pkg" = "X$p"; then
enable_fast_install=yes
fi
done
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
;;
esac],
[enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT)
......@@ -304,14 +306,14 @@ AU_DEFUN([AC_ENABLE_FAST_INSTALL],
[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you put
the `fast-install' option into LT_INIT's first parameter.])
the 'fast-install' option into LT_INIT's first parameter.])
])
AU_DEFUN([AC_DISABLE_FAST_INSTALL],
[_LT_SET_OPTION([LT_INIT], [disable-fast-install])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you put
the `disable-fast-install' option into LT_INIT's first parameter.])
the 'disable-fast-install' option into LT_INIT's first parameter.])
])
dnl aclocal-1.4 backwards compatibility:
......@@ -319,11 +321,64 @@ dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], [])
dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], [])
# _LT_WITH_AIX_SONAME([DEFAULT])
# ----------------------------------
# implement the --with-aix-soname flag, and support the `aix-soname=aix'
# and `aix-soname=both' and `aix-soname=svr4' LT_INIT options. DEFAULT
# is either `aix', `both' or `svr4'. If omitted, it defaults to `aix'.
m4_define([_LT_WITH_AIX_SONAME],
[m4_define([_LT_WITH_AIX_SONAME_DEFAULT], [m4_if($1, svr4, svr4, m4_if($1, both, both, aix))])dnl
shared_archive_member_spec=
case $host,$enable_shared in
power*-*-aix[[5-9]]*,yes)
AC_MSG_CHECKING([which variant of shared library versioning to provide])
AC_ARG_WITH([aix-soname],
[AS_HELP_STRING([--with-aix-soname=aix|svr4|both],
[shared library versioning (aka "SONAME") variant to provide on AIX, @<:@default=]_LT_WITH_AIX_SONAME_DEFAULT[@:>@.])],
[case $withval in
aix|svr4|both)
;;
*)
AC_MSG_ERROR([Unknown argument to --with-aix-soname])
;;
esac
lt_cv_with_aix_soname=$with_aix_soname],
[AC_CACHE_VAL([lt_cv_with_aix_soname],
[lt_cv_with_aix_soname=]_LT_WITH_AIX_SONAME_DEFAULT)
with_aix_soname=$lt_cv_with_aix_soname])
AC_MSG_RESULT([$with_aix_soname])
if test aix != "$with_aix_soname"; then
# For the AIX way of multilib, we name the shared archive member
# based on the bitwidth used, traditionally 'shr.o' or 'shr_64.o',
# and 'shr.imp' or 'shr_64.imp', respectively, for the Import File.
# Even when GNU compilers ignore OBJECT_MODE but need '-maix64' flag,
# the AIX toolchain works better with OBJECT_MODE set (default 32).
if test 64 = "${OBJECT_MODE-32}"; then
shared_archive_member_spec=shr_64
else
shared_archive_member_spec=shr
fi
fi
;;
*)
with_aix_soname=aix
;;
esac
_LT_DECL([], [shared_archive_member_spec], [0],
[Shared archive member basename, for filename based shared library versioning on AIX])dnl
])# _LT_WITH_AIX_SONAME
LT_OPTION_DEFINE([LT_INIT], [aix-soname=aix], [_LT_WITH_AIX_SONAME([aix])])
LT_OPTION_DEFINE([LT_INIT], [aix-soname=both], [_LT_WITH_AIX_SONAME([both])])
LT_OPTION_DEFINE([LT_INIT], [aix-soname=svr4], [_LT_WITH_AIX_SONAME([svr4])])
# _LT_WITH_PIC([MODE])
# --------------------
# implement the --with-pic flag, and support the `pic-only' and `no-pic'
# implement the --with-pic flag, and support the 'pic-only' and 'no-pic'
# LT_INIT options.
# MODE is either `yes' or `no'. If omitted, it defaults to `both'.
# MODE is either 'yes' or 'no'. If omitted, it defaults to 'both'.
m4_define([_LT_WITH_PIC],
[AC_ARG_WITH([pic],
[AS_HELP_STRING([--with-pic@<:@=PKGS@:>@],
......@@ -334,19 +389,17 @@ m4_define([_LT_WITH_PIC],
*)
pic_mode=default
# Look at the argument we got. We use all the common list separators.
lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
for lt_pkg in $withval; do
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
if test "X$lt_pkg" = "X$lt_p"; then
pic_mode=yes
fi
done
IFS="$lt_save_ifs"
IFS=$lt_save_ifs
;;
esac],
[pic_mode=default])
test -z "$pic_mode" && pic_mode=m4_default([$1], [default])
[pic_mode=m4_default([$1], [default])])
_LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl
])# _LT_WITH_PIC
......@@ -359,7 +412,7 @@ AU_DEFUN([AC_LIBTOOL_PICMODE],
[_LT_SET_OPTION([LT_INIT], [pic-only])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you
put the `pic-only' option into LT_INIT's first parameter.])
put the 'pic-only' option into LT_INIT's first parameter.])
])
dnl aclocal-1.4 backwards compatibility:
......
# ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*-
#
# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
# Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software
# Foundation, Inc.
# Written by Gary V. Vaughan, 2004
#
# This file is free software; the Free Software Foundation gives
......@@ -33,7 +34,7 @@ m4_define([_lt_join],
# ------------
# Manipulate m4 lists.
# These macros are necessary as long as will still need to support
# Autoconf-2.59 which quotes differently.
# Autoconf-2.59, which quotes differently.
m4_define([lt_car], [[$1]])
m4_define([lt_cdr],
[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
......@@ -44,7 +45,7 @@ m4_define([lt_unquote], $1)
# lt_append(MACRO-NAME, STRING, [SEPARATOR])
# ------------------------------------------
# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
# Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'.
# Note that neither SEPARATOR nor STRING are expanded; they are appended
# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
# No SEPARATOR is output if MACRO-NAME was previously undefined (different
......
# ltversion.m4 -- version numbers -*- Autoconf -*-
#
# Copyright (C) 2004 Free Software Foundation, Inc.
# Copyright (C) 2004, 2011-2015 Free Software Foundation, Inc.
# Written by Scott James Remnant, 2004
#
# This file is free software; the Free Software Foundation gives
......@@ -9,15 +9,15 @@
# @configure_input@
# serial 3337 ltversion.m4
# serial 4179 ltversion.m4
# This file is part of GNU Libtool
m4_define([LT_PACKAGE_VERSION], [2.4.2])
m4_define([LT_PACKAGE_REVISION], [1.3337])
m4_define([LT_PACKAGE_VERSION], [2.4.6])
m4_define([LT_PACKAGE_REVISION], [2.4.6])
AC_DEFUN([LTVERSION_VERSION],
[macro_version='2.4.2'
macro_revision='1.3337'
[macro_version='2.4.6'
macro_revision='2.4.6'
_LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
_LT_DECL(, macro_revision, 0)
])
# lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*-
#
# Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
# Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software
# Foundation, Inc.
# Written by Scott James Remnant, 2004.
#
# This file is free software; the Free Software Foundation gives
......@@ -11,7 +12,7 @@
# These exist entirely to fool aclocal when bootstrapping libtool.
#
# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN),
# which have later been changed to m4_define as they aren't part of the
# exported API, or moved to Autoconf or Automake where they belong.
#
......@@ -25,7 +26,7 @@
# included after everything else. This provides aclocal with the
# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
# because those macros already exist, or will be overwritten later.
# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
#
# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
# Yes, that means every name once taken will need to remain here until
......
......@@ -125,7 +125,7 @@ module ELPA2
!******
contains
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, na_rows, na_cols, &
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, na_rows, matrixCols, &
mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, &
useQR) result(success)
......@@ -1213,6 +1213,9 @@ subroutine bandred_real(na, a, lda, nblk, nbw, na_rows, na_cols, mpi_comm_rows,
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
istat = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*8_8,cudaMemcpyHostToDevice)
if(istat .ne. 0) print *, " cuad memcpy failed vav_dev ", istat
call cublas_dgemm('T','N',n_cols,n_cols,l_cols, &
1.d0, umc_dev,cur_l_cols,(umc_dev+(cur_l_cols * n_cols )*8_8),cur_l_cols, &
......@@ -1426,7 +1429,7 @@ end subroutine symm_matrix_allreduce
!-------------------------------------------------------------------------------
subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, na_rows, na_cols, mpi_comm_rows, &
subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, useQR)
......@@ -1472,9 +1475,9 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
#endif
implicit none
integer, intent(in) :: na, nqc, lda, ldq, nblk, nbw, na_rows, na_cols, mpi_comm_rows, mpi_comm_cols
integer, intent(in) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8, intent(in) :: a(:,:), tmat(:,:,:)
real*8, intent(inout):: q(:,:)
real*8, intent(inout):: q(ldq,matrixCols)
real*8, allocatable :: q_temp(:,:), tmat_temp(:,:)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: max_blocks_row, max_blocks_col, max_local_rows, &
......@@ -1579,13 +1582,19 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
stop
endif
#ifdef WITH_GPU_VERSION
allocate(q_temp(ldq,max_local_cols), stat=istat, errmsg=errorMessage)
allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error when allocating q_temp "//errorMessage
print *,"error when allocating hvm "//errorMessage
stop
endif
#ifdef WITH_GPU_VERSION
! allocate(q_temp(ldq,max_local_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when allocating q_temp "//errorMessage
! stop
! endif
allocate(tmat_temp(nbw,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error when allocating tmat_temp "//errorMessage
......@@ -1610,16 +1619,16 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
stop
endif
istat = cuda_malloc(q_dev, ldq*max_local_cols*8_8)
istat = cuda_malloc(q_dev, ldq*matrixCols*8_8)
if (istat .ne. 0) then
print *,"error in cudaMalloc"
stop
endif
q_temp(:,:) = 0.0
q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
istat = cuda_memcpy(q_dev, loc(q_temp), (ldq)*(max_local_cols)*8_8, cudaMemcpyHostToDevice)
istat = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*8_8, cudaMemcpyHostToDevice)
if (istat .ne. 0) then
print *,"error in cudaMalloc"
stop
......@@ -1632,11 +1641,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
endif
#endif
allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error when allocating hvm "//errorMessage
stop
endif
endif ! QR
......@@ -1771,7 +1776,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
#ifdef WITH_GPU_VERSION
istat = cuda_memcpy(hvm_dev, loc(hvm), ((max_local_rows)*nbw*8_8),cudaMemcpyHostToDevice)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif
......@@ -1783,40 +1788,53 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
#ifdef WITH_GPU_VERSION
call cublas_dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm_dev,max_local_rows, &
q_dev,ldq ,0.d0,tmp_dev,n_cols)
istat = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*8_8, cudaMemcpyDeviceToHost)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#else
call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), &
q,ldq,0.d0,tmp1,n_cols)
#endif
else
#ifdef WITH_GPU_VERSION
istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*8_8)
if (istat .ne. 0) then
print *,"error in cudaMemset"
stop
endif
#else
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*8_8)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error in cudaMemset"
! stop
! endif
!
!#else
tmp1(1:l_cols*n_cols) = 0
#endif
!#endif
endif
#ifdef WITH_GPU_VERSION
istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*8_8,cudaMemcpyDeviceToHost)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
stop
endif
#endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*8_8,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef WITH_GPU_VERSION
istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*8_8,cudaMemcpyHostToDevice)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
stop
endif
#endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*8_8,cudaMemcpyHostToDevice)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
if (l_rows>0) then
#ifdef WITH_GPU_VERSION
istat = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*8_8,cudaMemcpyHostToDevice)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
stop
endif
istat = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*8_8,cudaMemcpyHostToDevice)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
......@@ -1826,20 +1844,27 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
call cublas_dtrmm('L','U','T','N',n_cols,l_cols,1.0d0, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm_dev,max_local_rows, &
tmp_dev,n_cols,1.d0,q_dev,ldq)
istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*8_8),cudaMemcpyDeviceToHost)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
stop
endif
#else
call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,1),tmp2,n_cols)
call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,1), &
tmp2,n_cols,1.d0,q,ldq)
#endif
endif
#ifdef WITH_GPU_VERSION
istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*8_8),cudaMemcpyDeviceToHost)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
stop
endif
#endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*8_8),cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!
!#endif
enddo
endif ! endQR
......@@ -1868,13 +1893,13 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
stop
endif
istat = cuda_memcpy(loc(q_temp), q_dev, ldq*max_local_cols*8_8, cudaMemcpyDeviceToHost)
istat = cuda_memcpy(loc(q), q_dev, ldq*nqc*8_8, cudaMemcpyDeviceToHost)
if (istat .ne. 0) then
print *,"error in cudaFree"
stop
endif
q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
istat = cuda_free(q_dev)
if (istat .ne. 0) then
......@@ -1882,11 +1907,11 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
stop
endif
deallocate(q_temp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error when deallocating q_temp "//errorMessage
stop
endif
! deallocate(q_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when deallocating q_temp "//errorMessage
! stop
! endif
deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -2780,7 +2805,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, &
integer(c_size_t) :: a_dev
integer(c_size_t) :: bcast_buffer_dev
integer(c_size_t) :: num
integer(c_size_t) :: dev_offset, host_offset,dev_offset_1
integer(c_size_t) :: dev_offset, dev_offset_1
integer(c_size_t) :: row_dev
......@@ -3482,7 +3507,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, &
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
n_off = current_local_n+a_off
#ifdef WITH_GPU_VERSION
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *8
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *8_8
istat = cuda_memcpy( a_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)) ,stripe_width*nbw*8_8 ,1)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
......@@ -3526,8 +3551,8 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, &
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef WITH_GPU_VERSION
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *8
host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *8_8
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
istat = cuda_memcpy( a_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)),stripe_width*top_msg_length*8_8 ,1)
if (istat .ne. 0) then
print *,"error in cudaMemcpy"
......@@ -3593,7 +3618,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, &
if (bottom_msg_length>