From de6a4fde71caf681d35b6d13b032de7e362457a3 Mon Sep 17 00:00:00 2001
From: Andreas Marek
Date: Thu, 11 Feb 2016 07:37:58 +0100
Subject: [PATCH] Enable single-precision calculations for ELPA1

With the configure option "--enable-single-precision" ELPA1 is built
with single precision (i.e. 4-byte instead of 8-byte words) only.

The best achievable precision in single precision (float or single
complex) is 2^-23 ~ 1.2e-7. The accuracy of the error residual of
ELPA1 in single-precision mode is of the order 1e-4 to 1e-5. The
orthogonality of the EVs is fulfilled up to about 1e-6. Thus the
precision of ELPA1 in single-precision mode is roughly 100 - 1000
times less than the best achievable precision. This is consistent
with the double-precision mode, where the precision is also a factor
of 100 - 1000 below the theoretical best.

The float EVs are identical to the double EVs to at least 1e-2; the
error in the EVs is thus about 1e-2/1e-7 = 1e5 times larger than the
best theoretical precision. If the same holds for the double-precision
calculations, this implies that the double-precision results can also
only be trusted at the level of 1e-11 (5 orders of magnitude above
the best theoretical precision).

The best possible speed-up compared to the double-precision
calculation is a factor of two. This is far from being achieved yet,
since the single-precision version is not at all optimized at the
moment.
---
 Makefile.am                                  |   7 +-
 configure.ac                                 |  15 +-
 src/elpa1.F90                                |  13 +-
 src/elpa1_compute.F90                        | 843 ++++++++++++++----
 src/elpa_c_interface.F90                     |  74 +-
 src/elpa_reduce_add_vectors.X90              |  18 +-
 src/elpa_transpose_vectors.X90               |  19 +-
 src/{mod_precision.f90 => mod_precision.F90} |  11 +-
 src/redist_band.X90                          |  20 +-
 .../elpa1_test_complex_c_version.c           |  48 +-
 .../elpa1_test_real_c_version.c              |  46 +-
 test/fortran_test_programs/test_complex.F90  |  16 +
 test/fortran_test_programs/test_real.F90     |  15 +
 test/shared_sources/call_elpa1.c             |  17 +-
 test/shared_sources/check_correctnes.F90     | 186 +++-
 test/shared_sources/mod_from_c.F90           |  24 +-
 test/shared_sources/prepare_matrix.F90       |  60 +-
 17 files changed, 1130 insertions(+), 302 deletions(-)
 rename src/{mod_precision.f90 => mod_precision.F90} (58%)

diff --git a/Makefile.am b/Makefile.am
index cffd291c..222c9fd2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -9,7 +9,7 @@ AM_LDFLAGS = $(SCALAPACK_LDFLAGS)
 lib_LTLIBRARIES = libelpa@SUFFIX@.la
 libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
 
-libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
+libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
                              src/elpa_utilities.F90 \
                              src/elpa1_compute.F90 \
                              src/elpa1.F90 \
@@ -310,6 +310,8 @@ elpa2_test_complex_default_kernel.sh:
 elpa2_test_complex_choose_kernel_with_api.sh:
 	echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
 	chmod +x elpa2_test_complex_choose_kernel_with_api.sh
+mod_precision.i: $(top_srcdir)/src/mod_precision.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_precision.F90 -o $@
 
 elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
 	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@
@@ -320,6 +322,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
 elpa1.i: $(top_srcdir)/src/elpa1.F90
 	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
 
+elpa1_compute.i: $(top_srcdir)/src/elpa1_compute.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c
$(top_srcdir)/src/elpa1_compute.F90 -o $@ + elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ diff --git a/configure.ac b/configure.ac index 1225c97a..e1b184e8 100644 --- a/configure.ac +++ b/configure.ac @@ -509,6 +509,14 @@ if test x"${want_gpu}" = x"yes" ; then can_compile_gpu=yes fi +dnl check whether single precision is requested +AC_MSG_CHECKING(whether single precision calculations are requested) +AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision], + [build with single precision])], + want_single_precision="yes", want_single_precision="no") +AC_MSG_RESULT([${want_single_precision}]) + + dnl now check which kernels can be compiled dnl the checks for SSE were already done before @@ -722,10 +730,15 @@ DX_HTML_FEATURE(ON) DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs]) DESPERATELY_WANT_ASSUMED_SIZE=0 -if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then +if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable]) fi +if test x"${want_single_precision}" = x"no" ; then + AC_DEFINE([DOUBLE_PRECISION_REAL],[1],[use double precision for real calculation]) + AC_DEFINE([DOUBLE_PRECISION_COMPLEX],[1],[use double precision for complex calculation]) +fi + AC_SUBST([WITH_MKL]) AC_SUBST([WITH_BLACS]) AC_SUBST([with_amd_bulldozer_kernel]) diff --git a/src/elpa1.F90 b/src/elpa1.F90 index 44b6e5f0..bbf725da 100644 --- a/src/elpa1.F90 +++ b/src/elpa1.F90 @@ -88,6 +88,7 @@ module ELPA1 #ifdef HAVE_DETAILED_TIMINGS use timings #endif + use iso_c_binding implicit none PRIVATE ! By default, all routines contained are private @@ -104,9 +105,9 @@ module ELPA1 ! Timing results, set by every call to solve_evp_xxx - real(kind=rk), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) - real(kind=rk), public :: time_evp_solve !< time for solving the tridiagonal system - real(kind=rk), public :: time_evp_back !< time for back transformations of eigenvectors + real(kind=c_double), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) + real(kind=c_double), public :: time_evp_solve !< time for solving the tridiagonal system + real(kind=c_double), public :: time_evp_back !< time for back transformations of eigenvectors logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs @@ -294,6 +295,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp #ifdef HAVE_DETAILED_TIMINGS use timings #endif + use iso_c_binding implicit none integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols @@ -303,7 +305,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp integer(kind=ik) :: my_prow, my_pcol, mpierr real(kind=rk), allocatable :: e(:), tau(:) - real(kind=rk) :: ttt0, ttt1 + real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double logical :: success logical, save :: firstCall = .true. 
logical :: wantDebug @@ -395,6 +397,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, use timings #endif use precision + use iso_c_binding implicit none integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols @@ -407,7 +410,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, integer(kind=ik) :: l_rows, l_cols, l_cols_nev real(kind=rk), allocatable :: q_real(:,:), e(:) complex(kind=ck), allocatable :: tau(:) - real(kind=rk) :: ttt0, ttt1 + real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double logical :: success logical, save :: firstCall = .true. diff --git a/src/elpa1_compute.F90 b/src/elpa1_compute.F90 index 1dfb913f..e829b13a 100644 --- a/src/elpa1_compute.F90 +++ b/src/elpa1_compute.F90 @@ -89,6 +89,7 @@ module ELPA1_compute include 'mpif.h' contains +#ifdef DOUBLE_PRECISION_REAL #define DATATYPE REAL(kind=rk) #define BYTESIZE 8 @@ -99,6 +100,19 @@ module ELPA1_compute #undef BYTESIZE #undef REALCASE +#else + +#define DATATYPE REAL(kind=rk) +#define BYTESIZE 4 +#define REALCASE 1 +#include "elpa_transpose_vectors.X90" +#include "elpa_reduce_add_vectors.X90" +#undef DATATYPE +#undef BYTESIZE +#undef REALCASE + +#endif /* DOUBLE_PRECISION_REAL */ + subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- @@ -239,8 +253,14 @@ module ELPA1_compute vr(1:l_rows) = a(1:l_rows,l_cols+1) if(nstor>0 .and. l_rows>0) then - call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & - uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) +#ifdef DOUBLE_PRECISION_REAL + call DGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), & + uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1) +#else + call SGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), & + uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1) + +#endif endif if(my_prow==prow(istep-1, nblk, np_rows)) then @@ -251,8 +271,11 @@ module ELPA1_compute aux1(2) = 0. endif - call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - +#if DOUBLE_PRECISION_REAL + call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif vnorm2 = aux2(1) vrl = aux2(2) @@ -274,7 +297,11 @@ module ELPA1_compute ! Broadcast the Householder vector (and tau) along columns if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) - call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#endif tau(istep) = vr(l_rows+1) ! 
Transpose Householder vector vr -> vc @@ -319,15 +346,33 @@ module ELPA1_compute if (lre0) then - call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) - call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) +#ifdef DOUBLE_PRECISION_REAL + call DGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1) + call DGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1) +#else + call SGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1) + call SGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1) +#endif endif endif @@ -363,7 +413,12 @@ module ELPA1_compute if (l_cols>0) then tmp(1:l_cols) = uc(1:l_cols) - call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif + endif call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & @@ -374,8 +429,11 @@ module ELPA1_compute x = 0 if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) - call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) +#endif ! store u and v in the matrices U and V ! these matrices are stored combined in one here @@ -400,9 +458,16 @@ module ELPA1_compute lrs = 1 lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & - call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) - +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr) +#endif nb = 0 do ic=ics,ice l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector @@ -578,7 +657,12 @@ module ELPA1_compute tmat = 0 if (l_rows>0) & - call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) +#ifdef DOUBLE_PRECISION_REAL + call dsyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows) +#else + call ssyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows) +#endif + nc = 0 do n=1,nstor-1 @@ -586,14 +670,22 @@ module ELPA1_compute nc = nc+n enddo - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_REAL + if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif ! Calculate triangular matrix T nc = 0 tmat(1,1) = tau(ice-nstor+1) do n=1,nstor-1 - call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1) +#ifdef DOUBLE_PRECISION_REAL + call dtrmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) +#else + call strmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) +#endif tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1) nc = nc+n @@ -602,17 +694,32 @@ module ELPA1_compute ! 
Q = Q - V * T * V**T * Q if (l_rows>0) then - call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & - q,ldq,0.d0,tmp1,nstor) +#ifdef DOUBLE_PRECISION_REAL + call dgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & + q, ldq, 0.0_rk, tmp1, nstor) +#else + call sgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & + q, ldq, 0.0_rk, tmp1, nstor) +#endif + else tmp1(1:l_cols*nstor) = 0 endif - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) if (l_rows>0) then - call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) - call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & - tmp2,nstor,1.d0,q,ldq) + call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor) + call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), & + tmp2, nstor, 1.0_rk, q, ldq) endif +#else + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) + if (l_rows>0) then + call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor) + call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), & + tmp2, nstor, 1.0_rk, q, ldq) + endif +#endif nstor = 0 endif @@ -809,8 +916,11 @@ module ELPA1_compute enddo ! Broadcast block column - - call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr) +#endif ! Insert what we got in aux_mat @@ -844,15 +954,24 @@ module ELPA1_compute if (lcs<=lce) then allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) if (lrs<=lre) then - call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,0.d0,tmp1,nstor) +#ifdef DOUBLE_PRECISION_REAL + call dgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor) +#else + call sgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor) +#endif + else tmp1 = 0 endif ! Sum up the results and send to processor row np - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_REAL + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL8, MPI_SUM, np, mpi_comm_rows, mpierr) +#else + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr) +#endif ! 
Put the result into C if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) @@ -873,6 +992,8 @@ module ELPA1_compute end subroutine mult_at_b_real +#ifdef DOUBLE_PRECISION_COMPLEX + #define DATATYPE COMPLEX(kind=ck) #define BYTESIZE 16 #define COMPLEXCASE 1 @@ -882,6 +1003,18 @@ module ELPA1_compute #undef BYTESIZE #undef COMPLEXCASE +#else /* DOUBLE_PRECISION_COMPLEX */ + +#define DATATYPE COMPLEX(kind=ck) +#define BYTESIZE 8 +#define COMPLEXCASE 1 +#include "elpa_transpose_vectors.X90" +#include "elpa_reduce_add_vectors.X90" +#undef DATATYPE +#undef BYTESIZE +#undef COMPLEXCASE + +#endif /* DOUBLE_PRECISION_COMPLEX */ subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- @@ -930,7 +1063,7 @@ module ELPA1_compute integer(kind=ik), parameter :: max_stored_rows = 32 - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) + complex(kind=ck), parameter :: CZERO = (0.0_rk,0.0_rk), CONE = (1.0_rk,0.0_rk) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols @@ -1028,8 +1161,13 @@ module ELPA1_compute vr(1:l_rows) = a(1:l_rows,l_cols+1) if (nstor>0 .and. l_rows>0) then aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) - call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), & - aux,1,CONE,vr,1) +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & + aux, 1, CONE, vr, 1) +#else + call CGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & + aux, 1, CONE, vr, 1) +#endif endif if (my_prow==prow(istep-1, nblk, np_rows)) then @@ -1039,9 +1177,11 @@ module ELPA1_compute aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) aux1(2) = 0. endif - - call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif vnorm2 = aux2(1) vrl = aux2(2) @@ -1063,7 +1203,11 @@ module ELPA1_compute ! Broadcast the Householder vector (and tau) along columns if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) - call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(vr, l_rows+1, MPI_DOUBLE_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(vr, l_rows+1, MPI_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#endif tau(istep) = vr(l_rows+1) ! 
Transpose Householder vector vr -> vc @@ -1111,14 +1255,34 @@ module ELPA1_compute if (lre0) then - call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1) - call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1) +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) + call ZGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) +#else + call CGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) + call CGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) +#endif endif endif @@ -1156,7 +1325,12 @@ module ELPA1_compute if (l_cols>0) then tmp(1:l_cols) = uc(1:l_cols) - call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(tmp, uc, l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(tmp, uc, l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif + endif ! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & @@ -1173,7 +1347,11 @@ module ELPA1_compute xc = 0 if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) - call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(xc, vav, 1 , MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(xc, vav, 1 , MPI_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) +#endif ! store u and v in the matrices U and V ! these matrices are stored combined in one here @@ -1199,9 +1377,15 @@ module ELPA1_compute lrs = 1 lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & - call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(hvb, nb, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(hvb, nb, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) +#endif nb = 0 do ic=ics,ice l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector @@ -1392,22 +1599,32 @@ module ELPA1_compute tmat = 0 if (l_rows>0) & - call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows) - +#ifdef DOUBLE_PRECISION_COMPLEX + call zherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) +#else + call cherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) +#endif nc = 0 do n=1,nstor-1 h1(nc+1:nc+n) = tmat(1:n,n+1) nc = nc+n enddo - - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif ! Calculate triangular matrix T nc = 0 tmat(1,1) = tau(ice-nstor+1) do n=1,nstor-1 - call ztrmv('L','C','N',n,tmat,max_stored_rows,h2(nc+1),1) +#ifdef DOUBLE_PRECISION_COMPLEX + call ztrmv('L', 'C', 'N', n, tmat, max_stored_rows, h2(nc+1),1) +#else + call ctrmv('L', 'C', 'N', n, tmat, max_stored_rows, h2(nc+1),1) +#endif tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1) nc = nc+n @@ -1416,17 +1633,31 @@ module ELPA1_compute ! 
Q = Q - V * T * V**T * Q if (l_rows>0) then - call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & - q,ldq,CZERO,tmp1,nstor) +#ifdef DOUBLE_PRECISION_COMPLEX + call zgemm('C', 'N', nstor, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & + q, ldq, CZERO, tmp1 ,nstor) +#else + call cgemm('C', 'N', nstor, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & + q, ldq, CZERO, tmp1 ,nstor) +#endif else tmp1(1:l_cols*nstor) = 0 endif - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) if (l_rows>0) then - call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor) - call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), & - tmp2,nstor,CONE,q,ldq) + call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor) + call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), & + tmp2, nstor, CONE, q, ldq) endif +#else + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) + if (l_rows>0) then + call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor) + call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), & + tmp2, nstor, CONE, q, ldq) + endif +#endif nstor = 0 endif @@ -1624,8 +1855,11 @@ module ELPA1_compute enddo ! Broadcast block column - - call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(aux_bc, n_aux_bc, MPI_DOUBLE_COMPLEX, np_bc, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(aux_bc, n_aux_bc, MPI_COMPLEX, np_bc, mpi_comm_cols, mpierr) +#endif ! Insert what we got in aux_mat @@ -1659,15 +1893,23 @@ module ELPA1_compute if (lcs<=lce) then allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) if (lrs<=lre) then - call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor) +#ifdef DOUBLE_PRECISION_COMPLEX + call zgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk,0.0_rk), aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, (0.0_rk,0.0_rk), tmp1, nstor) +#else + call cgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk,0.0_rk), aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, (0.0_rk,0.0_rk), tmp1, nstor) +#endif else tmp1 = 0 endif ! Sum up the results and send to processor row np - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) +#else + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) +#endif ! 
Put the result into C if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) @@ -1768,8 +2010,10 @@ module ELPA1_compute else nev1 = MIN(nev,l_cols) endif + call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & matrixCols, mpi_comm_rows, wantDebug, success) + if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") @@ -1875,20 +2119,37 @@ module ELPA1_compute if (my_pcol==np_off) then do n=np_off+np1,np_off+nprocs-1 - call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(d(noff+1), nmid, MPI_REAL8, n, 1, mpi_comm_cols, mpierr) +#else + call mpi_send(d(noff+1), nmid, MPI_REAL4, n, 1, mpi_comm_cols, mpierr) +#endif enddo endif if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol 1d-14) then + if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk) then write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) else write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) @@ -2320,7 +2594,7 @@ module ELPA1_compute ! Calculations start here beta = abs(e) - sig = sign(1.d0,e) + sig = sign(1.0_rk,e) ! Calculate rank-1 modifier z @@ -2345,14 +2619,17 @@ module ELPA1_compute ! Normalize z so that norm(z) = 1. Since z is the concatenation of ! two normalized vectors, norm2(z) = sqrt(2). - z = z/sqrt(2.0d0) + z = z/sqrt(2.0_rk) rho = 2.*beta ! Calculate index for merging both systems by ascending eigenvalues - +#ifdef DOUBLE_PRECISION_REAL call DLAMRG( nm, na-nm, d, 1, 1, idx ) +#else + call SLAMRG( nm, na-nm, d, 1, 1, idx ) +#endif - ! Calculate the allowable deflation tolerance +! Calculate the allowable deflation tolerance zmax = maxval(abs(z)) dmax = maxval(abs(d)) @@ -2510,9 +2787,13 @@ module ELPA1_compute if (na1==1) then d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation else ! na1==2 +#ifdef DOUBLE_PRECISION_REAL call DLAED5(1, d1, z1, qtrans(1,1), rho, d(1)) call DLAED5(2, d1, z1, qtrans(1,2), rho, d(2)) - +#else + call SLAED5(1, d1, z1, qtrans(1,1), rho, d(1)) + call SLAED5(2, d1, z1, qtrans(1,2), rho, d(2)) +#endif call transform_columns(idx1(1), idx1(2)) endif @@ -2520,9 +2801,11 @@ module ELPA1_compute d(na1+1:na) = d2(1:na2) ! Calculate arrangement of all eigenvalues in output - +#ifdef DOUBLE_PRECISION_REAL call DLAMRG( na1, na-na1, d, 1, 1, idx ) - +#else + call SLAMRG( na1, na-na1, d, 1, 1, idx ) +#endif ! Rearrange eigenvalues tmp = d @@ -2565,9 +2848,11 @@ module ELPA1_compute !$OMP DO #endif DO i = my_proc+1, na1, n_procs ! work distributed over all processors - +#ifdef DOUBLE_PRECISION_REAL call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! - +#else + call SLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! +#endif if (info/=0) then ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) ! use the more stable bisection algorithm in solve_secular_equation @@ -2624,7 +2909,7 @@ module ELPA1_compute ! Calculate scale factors for eigenvectors - ev_scale(:) = 0. + ev_scale(:) = 0._rk #ifdef WITH_OPENMP @@ -2663,9 +2948,11 @@ module ELPA1_compute d(na1+1:na) = d2(1:na2) ! Calculate arrangement of all eigenvalues in output - +#ifdef DOUBLE_PRECISION_REAL call DLAMRG( na1, na-na1, d, 1, 1, idx ) - +#else + call SLAMRG( na1, na-na1, d, 1, 1, idx ) +#endif ! 
Rearrange eigenvalues tmp = d @@ -2758,10 +3045,15 @@ module ELPA1_compute else np_rem = np_rem-1 endif - +#ifdef DOUBLE_PRECISION_REAL call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, & np_next, 1111, np_prev, 1111, & mpi_comm_cols, mpi_status, mpierr) +#else + call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL4, & + np_next, 1111, np_prev, 1111, & + mpi_comm_cols, mpi_status, mpierr) +#endif endif ! Gather the parts in d1 and z which are fitting to qtmp1. @@ -2823,9 +3115,13 @@ module ELPA1_compute ! Multiply old Q with eigenvectors (upper half) if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & - call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1,ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(1,1),ubound(qtmp2,dim=1)) - +#ifdef DOUBLE_PRECISION_REAL + call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & + 1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1)) +#else + call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & + 1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1)) +#endif ! Compute eigenvectors of the rank-1 modified matrix. ! Parts for multiplying with lower half of Q: @@ -2841,8 +3137,13 @@ module ELPA1_compute ! Multiply old Q with eigenvectors (lower half) if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & - call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1(l_rnm+1,1),ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,dim=1)) +#ifdef DOUBLE_PRECISION_REAL + call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & + ubound(ev,dim=1), 1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) +#else + call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & + ubound(ev,dim=1), 1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) +#endif ! Put partial result into (output) Q @@ -2883,7 +3184,7 @@ module ELPA1_compute tmp(1:na1) = z(1:na1) / tmp(1:na1) - ev_scale_value = 1.0/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) + ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) end subroutine add_tmp @@ -2921,10 +3222,18 @@ module ELPA1_compute ! 
send and recieve column are local qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) else - call mpi_send(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,mod(i,4096),mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, mod(i,4096), mpi_comm_cols, mpierr) +#else + call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, mod(i,4096), mpi_comm_cols, mpierr) +#endif endif else if (pc2==my_pcol) then - call mpi_recv(qtmp(1,nc),l_rows,MPI_REAL8,pc1,mod(i,4096),mpi_comm_cols,mpi_status,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL8, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL4, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) +#endif endif enddo @@ -2968,15 +3277,27 @@ module ELPA1_compute q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) else - call mpi_sendrecv(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,1, & - tmp,l_rows,MPI_REAL8,pc2,1, & - mpi_comm_cols,mpi_status,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, 1, & + tmp, l_rows, MPI_REAL8, pc2, 1, & + mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, 1, & + tmp, l_rows, MPI_REAL4, pc2, 1, & + mpi_comm_cols, mpi_status, mpierr) +#endif q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) endif else if (pc2==my_pcol) then - call mpi_sendrecv(q(l_rqs,lc2),l_rows,MPI_REAL8,pc1,1, & - tmp,l_rows,MPI_REAL8,pc1,1, & - mpi_comm_cols,mpi_status,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL8, pc1, 1, & + tmp, l_rows, MPI_REAL8, pc1, 1, & + mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL4, pc1, 1, & + tmp, l_rows, MPI_REAL4, pc1, 1, & + mpi_comm_cols, mpi_status, mpierr) +#endif q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) endif @@ -2998,9 +3319,11 @@ module ELPA1_compute if (npc_n==1 .and. np_rows==1) return ! nothing to do ! Do an mpi_allreduce over processor rows - +#ifdef DOUBLE_PRECISION_REAL call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - +#else + call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif ! If only 1 processor column, we are done if (npc_n==1) then z(:) = tmp(:) @@ -3009,7 +3332,11 @@ module ELPA1_compute ! If all processor columns are involved, we can use mpi_allreduce if (npc_n==np_cols) then +#ifdef DOUBLE_PRECISION_REAL call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) +#endif return endif @@ -3017,8 +3344,13 @@ module ELPA1_compute z(:) = 0 do np = 1, npc_n z(:) = z(:) + tmp(:) +#ifdef DOUBLE_PRECISION_REAL call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, & mpi_comm_cols, mpi_status, mpierr) +#else + call MPI_Sendrecv_replace(z, n, MPI_REAL4, np_next, 1111, np_prev, 1111, & + mpi_comm_cols, mpi_status, mpierr) +#endif enddo end subroutine global_gather @@ -3037,9 +3369,11 @@ module ELPA1_compute if (npc_n==1 .and. np_rows==1) return ! nothing to do ! Do an mpi_allreduce over processor rows - +#ifdef DOUBLE_PRECISION_REAL call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr) - +#else + call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_PROD, mpi_comm_rows, mpierr) +#endif ! 
If only 1 processor column, we are done if (npc_n==1) then z(:) = tmp(:) @@ -3048,7 +3382,11 @@ module ELPA1_compute ! If all processor columns are involved, we can use mpi_allreduce if (npc_n==np_cols) then +#ifdef DOUBLE_PRECISION_REAL call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_PROD, mpi_comm_cols, mpierr) +#endif return endif @@ -3058,15 +3396,28 @@ module ELPA1_compute if (my_pcol == npc_0) then z(1:n) = tmp(1:n) do np = npc_0+1, npc_0+npc_n-1 - call mpi_recv(tmp,n,MPI_REAL8,np,1111,mpi_comm_cols,mpi_status,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_recv(tmp, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_recv(tmp, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpi_status, mpierr) +#endif z(1:n) = z(1:n)*tmp(1:n) enddo do np = npc_0+1, npc_0+npc_n-1 - call mpi_send(z,n,MPI_REAL8,np,1111,mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(z, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpierr) +#else + call mpi_send(z, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpierr) +#endif enddo else - call mpi_send(tmp,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpierr) - call mpi_recv(z ,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpi_status,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(tmp, n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpierr) + call mpi_recv(z ,n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_send(tmp, n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpierr) + call mpi_recv(z ,n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) +#endif endif end subroutine global_product @@ -3111,7 +3462,7 @@ module ELPA1_compute use precision implicit none - real(kind=rk) :: g_col(nlen), l_col(*) ! chnage this to proper 2d 1d matching + real(kind=rk) :: g_col(nlen), l_col(*) ! chnage this to proper 2d 1d matching ! remove assumed size integer(kind=ik) :: noff, nlen, my_prow, np_rows, nblk integer(kind=ik) :: nbs, nbe, jb, g_off, l_off, js, je @@ -3216,7 +3567,7 @@ module ELPA1_compute delta(:) = d(:) - dshift a = 0. ! delta(n) - b = rho*SUM(z(:)**2) + 1. ! rho*SUM(z(:)**2) is the lower bound for the guess + b = rho*SUM(z(:)**2) + 1._rk ! rho*SUM(z(:)**2) is the lower bound for the guess else @@ -3224,8 +3575,8 @@ module ELPA1_compute ! We check the sign of the function in the midpoint of the interval ! in order to determine if eigenvalue is more close to d(i) or d(i+1) - x = 0.5*(d(i)+d(i+1)) - y = 1. + rho*SUM(z(:)**2/(d(:)-x)) + x = 0.5_rk*(d(i)+d(i+1)) + y = 1._rk + rho*SUM(z(:)**2/(d(:)-x)) if (y>0) then ! solution is next to d(i) @@ -3247,11 +3598,14 @@ module ELPA1_compute ! Interval subdivision - x = 0.5*(a+b) + x = 0.5_rk*(a+b) if (x==a .or. x==b) exit ! No further interval subdivisions possible - if (abs(x) < 1.d-200) exit ! x next to pole - +#ifdef DOUBLE_PRECISION_REAL + if (abs(x) < 1.e-200_rk) exit ! x next to pole +#else + if (abs(x) < 1.e-20_rk) exit ! x next to pole +#endif ! evaluate value at x y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) @@ -3432,8 +3786,11 @@ module ELPA1_compute ! of the remaining block if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then - - call dpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) +#ifdef DOUBLE_PRECISION_REAL + call dpotrf('U', na-n+1, a(l_row1,l_col1), lda, info) +#else + call spotrf('U', na-n+1, a(l_row1,l_col1), lda, info) +#endif if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" success = .false. 
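The same preprocessor dispatch seen in the cholesky_real hunk above (pick the BLAS/LAPACK routine and the matching MPI datatype at compile time from a single source file) recurs at every call site in this patch. A minimal self-contained C sketch of the pattern follows; note that the patch itself does this in Fortran, the helper names factor_block and allreduce_row are hypothetical, and availability of LAPACKE and MPI is assumed:

#include <mpi.h>
#include <lapacke.h>

#ifdef DOUBLE_PRECISION_REAL
typedef double real_t;
#define ELPA_MPI_REAL MPI_DOUBLE     /* Fortran side: MPI_REAL8 */
#define POTRF LAPACKE_dpotrf         /* Fortran side: dpotrf    */
#else
typedef float real_t;
#define ELPA_MPI_REAL MPI_FLOAT      /* Fortran side: MPI_REAL4 */
#define POTRF LAPACKE_spotrf         /* Fortran side: spotrf    */
#endif

/* Cholesky-factorize the local diagonal block ('U' = upper triangle);
 * returns 0 on success, non-zero if the block is not positive definite. */
static int factor_block(real_t *a, int n, int lda)
{
    return (int) POTRF(LAPACK_COL_MAJOR, 'U', n, a, lda);
}

/* Sum a local result vector over the process row, as tridiag_real does
 * with mpi_allreduce; the MPI datatype follows the build precision.    */
static void allreduce_row(const real_t *tmp, real_t *uc, int l_cols,
                          MPI_Comm comm_rows)
{
    MPI_Allreduce(tmp, uc, l_cols, ELPA_MPI_REAL, MPI_SUM, comm_rows);
}

The configure switch decides which branch is compiled, so one source tree serves both precisions without duplicating the numerics; this mirrors how elpa1_compute.F90 is organized after this patch.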
@@ -3452,8 +3809,11 @@ module ELPA1_compute ! The process owning the upper left remaining block does the ! Cholesky-Factorization of this block - - call dpotrf('U',nblk,a(l_row1,l_col1),lda,info) +#ifdef DOUBLE_PRECISION_REAL + call dpotrf('U', nblk, a(l_row1,l_col1), lda, info) +#else + call spotrf('U', nblk, a(l_row1,l_col1), lda, info) +#endif if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" success = .false. @@ -3466,9 +3826,11 @@ module ELPA1_compute nc = nc+i enddo endif - - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif nc = 0 do i=1,nblk tmp2(1:i,i) = tmp1(nc+1:nc+i) @@ -3476,15 +3838,22 @@ module ELPA1_compute enddo if (l_cols-l_colx+1>0) & - call dtrsm('L','U','T','N',nblk,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - +#ifdef DOUBLE_PRECISION_REAL + call dtrsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#else + call strsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#endif endif do i=1,nblk if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#endif enddo ! this has to be checked since it was changed substantially when doing type safe @@ -3498,9 +3867,15 @@ module ELPA1_compute lrs = l_rowx lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & - call DTRMM('L','U','N','N',nb,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - +#ifdef DOUBLE_PRECISION_REAL + call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#else + call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#endif if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 @@ -3640,18 +4023,30 @@ module ELPA1_compute endif do i=1,nb - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif enddo endif if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#endif if (l_row1>1 .and. 
l_cols-l_col1+1>0) & - call dgemm('N','N',l_row1-1,l_cols-l_col1+1,nb, -1.d0, & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - 1.d0, a(1,l_col1),lda) - +#ifdef DOUBLE_PRECISION_REAL + call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk, & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + 1.0_rk, a(1,l_col1), lda) +#else + call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk, & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + 1.0_rk, a(1,l_col1), lda) +#endif enddo deallocate(tmp1, tmp2, tmat1, tmat2) @@ -3753,8 +4148,11 @@ module ELPA1_compute ! of the remaining block if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then - - call zpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) +#ifdef DOUBLE_PRECISION_COMPLEX + call zpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) +#else + call cpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) +#endif if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" success = .false. @@ -3772,8 +4170,11 @@ module ELPA1_compute ! The process owning the upper left remaining block does the ! Cholesky-Factorization of this block - - call zpotrf('U',nblk,a(l_row1,l_col1),lda,info) +#ifdef DOUBLE_PRECISION_COMPLEX + call zpotrf('U', nblk, a(l_row1,l_col1),lda, info) +#else + call cpotrf('U', nblk, a(l_row1,l_col1),lda, info) +#endif if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" success = .false. @@ -3786,9 +4187,11 @@ module ELPA1_compute nc = nc+i enddo endif - - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif nc = 0 do i=1,nblk tmp2(1:i,i) = tmp1(nc+1:nc+i) @@ -3796,16 +4199,26 @@ module ELPA1_compute enddo if (l_cols-l_colx+1>0) & - call ztrsm('L','U','C','N',nblk,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - +#ifdef DOUBLE_PRECISION_COMPLEX + call ztrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), & + a(l_row1,l_colx), lda) +#else + call ctrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), & + a(l_row1,l_colx), lda) +#endif endif do i=1,nblk if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#endif enddo ! 
this has to be checked since it was changed substantially when doing type safe call elpa_transpose_vectors_complex (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & @@ -3817,9 +4230,15 @@ module ELPA1_compute lrs = l_rowx lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & - call ZTRMM('L','U','N','N',nb,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - +#ifdef DOUBLE_PRECISION_COMPLEX + call ZTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#else + call CTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#endif if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 @@ -3958,18 +4385,32 @@ module ELPA1_compute endif do i=1,nb - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif enddo endif if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#endif if (l_row1>1 .and. l_cols-l_col1+1>0) & - call ZGEMM('N','N',l_row1-1,l_cols-l_col1+1,nb, (-1.d0,0.d0), & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - (1.d0,0.d0), a(1,l_col1),lda) - +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk,0.0_rk), & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + (1.0_rk,0.0_rk), a(1,l_col1), lda) +#else + call CGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk,0.0_rk), & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + (1.0_rk,0.0_rk), a(1,l_col1), lda) +#endif enddo deallocate(tmp1, tmp2, tmat1, tmat2) @@ -4047,9 +4488,12 @@ module ELPA1_compute real(kind=rk) :: ALPHR, ALPHI, BETA - ALPHR = DBLE( ALPHA ) + ALPHR = real( ALPHA, kind=rk ) +#ifdef DOUBLE_PRECISION_COMPLEX ALPHI = DIMAG( ALPHA ) - +#else + ALPHI = AIMAG( ALPHA ) +#endif if ( XNORM_SQ==0. .AND. ALPHI==0. ) then if ( ALPHR>=0. 
) then @@ -4068,10 +4512,15 @@ module ELPA1_compute BETA = -BETA TAU = -ALPHA / BETA ELSE - ALPHR = ALPHI * (ALPHI/DBLE( ALPHA )) - ALPHR = ALPHR + XNORM_SQ/DBLE( ALPHA ) + ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk)) + ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk ) +#ifdef DOUBLE_PRECISION_COMPLEX TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) ALPHA = DCMPLX( -ALPHR, ALPHI ) +#else + TAU = CMPLX( ALPHR/BETA, -ALPHI/BETA ) + ALPHA = CMPLX( -ALPHR, ALPHI ) +#endif END IF XF = 1./ALPHA ALPHA = BETA diff --git a/src/elpa_c_interface.F90 b/src/elpa_c_interface.F90 index c4d14e41..4da22bdd 100644 --- a/src/elpa_c_interface.F90 +++ b/src/elpa_c_interface.F90 @@ -125,19 +125,29 @@ !c> * !c> * \result int: 1 if error occured, otherwise 0 !c>*/ - !c> int elpa_solve_evp_real_1stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#ifdef DOUBLE_PRECISION_REAL + !c> int elpa_solve_evp_real_1stage_double_precision(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#else + !c> int elpa_solve_evp_real_1stage_single_precision(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#endif function solve_elpa1_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_real_1stage") - +#ifdef DOUBLE_PRECISION_REAL + result(success) bind(C,name="elpa_solve_evp_real_1stage_double_precision") +#else + result(success) bind(C,name="elpa_solve_evp_real_1stage_single_precision") +#endif use, intrinsic :: iso_c_binding use elpa1, only : solve_evp_real implicit none integer(kind=c_int) :: success integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows +#ifdef DOUBLE_PRECISION_REAL real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - +#else + real(kind=c_float) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) +#endif logical :: successFortran successFortran = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) @@ -173,19 +183,32 @@ !c> * !c> * \result int: 1 if error occured, otherwise 0 !c> */ - !c> int elpa_solve_evp_complex_1stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#ifdef DOUBLE_PRECISION_COMPLEX + !c> int elpa_solve_evp_complex_1stage_double_precision(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#else + !c> int elpa_solve_evp_complex_1stage_single_precision(int na, int nev, complex *a, int lda, float *ev, complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); +#endif + function solve_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_complex_1stage") - +#ifdef DOUBLE_PRECISION_COMPLEX + result(success) bind(C,name="elpa_solve_evp_complex_1stage_double_precision") +#else + result(success) bind(C,name="elpa_solve_evp_complex_1stage_single_precision") +#endif use, intrinsic :: iso_c_binding use elpa1, only : solve_evp_complex implicit none integer(kind=c_int) :: success integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, 
nblk, matrixCols, mpi_comm_cols, mpi_comm_rows +#ifdef DOUBLE_PRECISION_COMPLEX complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) real(kind=c_double) :: ev(1:na) +#else + complex(kind=c_float_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) + real(kind=c_float) :: ev(1:na) +#endif logical :: successFortran @@ -223,12 +246,20 @@ !c> * !c> * \result int: 1 if error occured, otherwise 0 !c> */ - !c> int elpa_solve_evp_real_2stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); +#ifdef DOUBLE_PRECISION_REAL + !c> int elpa_solve_evp_real_2stage_double_precision(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); +#else + !c> int elpa_solve_evp_real_2stage_single_precision(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); +#endif + function solve_elpa2_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & THIS_REAL_ELPA_KERNEL_API, useQR) & - result(success) bind(C,name="elpa_solve_evp_real_2stage") - +#ifdef DOUBLE_PRECISION_REAL + result(success) bind(C,name="elpa_solve_evp_real_2stage_double_precision") +#else + result(success) bind(C,name="elpa_solve_evp_real_2stage_single_precision") +#endif use, intrinsic :: iso_c_binding use elpa2, only : solve_evp_real_2stage @@ -237,9 +268,11 @@ integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & mpi_comm_all integer(kind=c_int), value, intent(in) :: THIS_REAL_ELPA_KERNEL_API, useQR +#ifdef DOUBLE_PRECISION_REAL real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - +#else + real(kind=c_float) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) +#endif logical :: successFortran, useQRFortran @@ -287,11 +320,19 @@ !c> * !c> * \result int: 1 if error occured, otherwise 0 !c> */ - !c> int elpa_solve_evp_complex_2stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); +#ifdef DOUBLE_PRECISION_COMPLEX + !c> int elpa_solve_evp_complex_2stage_double_precision(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); +#else + !c> int elpa_solve_evp_complex_2stage_single_precision(int na, int nev, complex *a, int lda, float *ev, complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); +#endif function solve_elpa2_evp_complex_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & THIS_COMPLEX_ELPA_KERNEL_API) & - result(success) bind(C,name="elpa_solve_evp_complex_2stage") +#ifdef DOUBLE_PRECISION_COMPLEX + result(success) bind(C,name="elpa_solve_evp_complex_2stage_double_precision") +#else + result(success) bind(C,name="elpa_solve_evp_complex_2stage_single_precision") +#endif use, intrinsic :: iso_c_binding use elpa2, only : 
solve_evp_complex_2stage @@ -301,8 +342,13 @@ integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & mpi_comm_all integer(kind=c_int), value, intent(in) :: THIS_COMPLEX_ELPA_KERNEL_API +#ifdef DOUBLE_PRECISION_COMPLEX complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) real(kind=c_double) :: ev(1:na) +#else + complex(kind=c_float_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) + real(kind=c_float) :: ev(1:na) +#endif logical :: successFortran successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & diff --git a/src/elpa_reduce_add_vectors.X90 b/src/elpa_reduce_add_vectors.X90 index 9651a92e..2b988141 100644 --- a/src/elpa_reduce_add_vectors.X90 +++ b/src/elpa_reduce_add_vectors.X90 @@ -49,6 +49,8 @@ ! distributed along with the original code in the file "COPYING". #endif +#include "config-f90.h" + #if REALCASE==1 subroutine elpa_reduce_add_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk) #endif @@ -146,13 +148,25 @@ subroutine elpa_reduce_add_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t !$omp master #endif #if REALCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr) + +#ifdef DOUBLE_PRECISION_REAL + if(k>0) call mpi_reduce(aux1, aux2, k, MPI_REAL8, MPI_SUM, ipt, comm_t, mpierr) +#else + if(k>0) call mpi_reduce(aux1, aux2, k, MPI_REAL4, MPI_SUM, ipt, comm_t, mpierr) #endif +#endif /* REALCASE==1 */ + #if COMPLEXCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_DOUBLE_COMPLEX,MPI_SUM,ipt,comm_t,mpierr) + +#ifdef DOUBLE_PRECISION_COMPLEX + if(k>0) call mpi_reduce(aux1, aux2, k, MPI_DOUBLE_COMPLEX, MPI_SUM, ipt, comm_t, mpierr) +#else + if(k>0) call mpi_reduce(aux1, aux2, k, MPI_COMPLEX, MPI_SUM, ipt, comm_t, mpierr) #endif +#endif /* COMPLEXCASE == 1 */ + #ifdef WITH_OPENMP !$omp end master !$omp barrier diff --git a/src/elpa_transpose_vectors.X90 b/src/elpa_transpose_vectors.X90 index 94543a2d..827ffb9f 100644 --- a/src/elpa_transpose_vectors.X90 +++ b/src/elpa_transpose_vectors.X90 @@ -49,6 +49,8 @@ ! distributed along with the original code in the file "COPYING". 
#endif +#include "config-f90.h" + #if REALCASE==1 subroutine elpa_transpose_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk) #endif @@ -155,12 +157,25 @@ subroutine elpa_transpose_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t, !$omp master #endif #if COMPLEXCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_DOUBLE_COMPLEX,ips,comm_s,mpierr) + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_DOUBLE_COMPLEX, ips, comm_s, mpierr) +#else + call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_COMPLEX, ips, comm_s, mpierr) #endif +#endif /* DOUBLE_PRECISION_COMPLEX */ + #if REALCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_REAL8,ips,comm_s,mpierr) + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_REAL8, ips, comm_s, mpierr) +#else + call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_REAL4, ips, comm_s, mpierr) #endif + +#endif /* REALCASE == 1 */ + #ifdef WITH_OPENMP !$omp end master !$omp barrier diff --git a/src/mod_precision.f90 b/src/mod_precision.F90 similarity index 58% rename from src/mod_precision.f90 rename to src/mod_precision.F90 index 5b10e4eb..91895d47 100644 --- a/src/mod_precision.f90 +++ b/src/mod_precision.F90 @@ -1,9 +1,18 @@ +#include "config-f90.h" module precision - use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_INT32_T, C_INT64_T + use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_INT32_T, C_INT64_T, C_FLOAT implicit none +#ifdef DOUBLE_PRECISION_REAL integer, parameter :: rk = C_DOUBLE +#else + integer, parameter :: rk = C_FLOAT +#endif +#ifdef DOUBLE_PRECISION_COMPLEX integer, parameter :: ck = C_DOUBLE +#else + integer, parameter :: ck = C_FLOAT +#endif integer, parameter :: ik = C_INT32_T integer, parameter :: lik = C_INT64_T end module precision diff --git a/src/redist_band.X90 b/src/redist_band.X90 index 239feb21..77004f44 100644 --- a/src/redist_band.X90 +++ b/src/redist_band.X90 @@ -50,6 +50,9 @@ #endif ! -------------------------------------------------------------------------------------------------- ! redist_band: redistributes band from 2D block cyclic form to 1D band + +#include "config-f90.h" + #if REALCASE==1 subroutine redist_band_real(r_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, r_ab) #endif @@ -248,12 +251,25 @@ subroutine redist_band_complex(c_a, lda, na, nblk, nbw, matrixCols, mpi_comm_row ! Exchange all data with MPI_Alltoallv #if REALCASE==1 - call MPI_Alltoallv(r_sbuf,ncnt_s,nstart_s,MPI_REAL8,r_rbuf,ncnt_r,nstart_r,MPI_REAL8,mpi_comm,mpierr) + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Alltoallv(r_sbuf, ncnt_s, nstart_s, MPI_REAL8, r_rbuf, ncnt_r, nstart_r, MPI_REAL8, mpi_comm, mpierr) +#else + call MPI_Alltoallv(r_sbuf, ncnt_s, nstart_s, MPI_REAL4, r_rbuf, ncnt_r, nstart_r, MPI_REAL4, mpi_comm, mpierr) #endif + +#endif /* REALCASE==1 */ + #if COMPLEXCASE==1 - call MPI_Alltoallv(c_sbuf,ncnt_s,nstart_s,MPI_COMPLEX16,c_rbuf,ncnt_r,nstart_r,MPI_COMPLEX16,mpi_comm,mpierr) + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Alltoallv(c_sbuf, ncnt_s, nstart_s, MPI_COMPLEX16, c_rbuf, ncnt_r, nstart_r, MPI_COMPLEX16, mpi_comm, mpierr) +#else + call MPI_Alltoallv(c_sbuf, ncnt_s, nstart_s, MPI_COMPLEX, c_rbuf, ncnt_r, nstart_r, MPI_COMPLEX, mpi_comm, mpierr) #endif +#endif /* COMPLEXCASE==1 */ + ! 
diff --git a/src/redist_band.X90 b/src/redist_band.X90
index 239feb21..77004f44 100644
--- a/src/redist_band.X90
+++ b/src/redist_band.X90
@@ -50,6 +50,9 @@
 #endif
 ! --------------------------------------------------------------------------------------------------
 ! redist_band: redistributes band from 2D block cyclic form to 1D band
+
+#include "config-f90.h"
+
 #if REALCASE==1
 subroutine redist_band_real(r_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, r_ab)
 #endif
@@ -248,12 +251,25 @@ subroutine redist_band_complex(c_a, lda, na, nblk, nbw, matrixCols, mpi_comm_row

   ! Exchange all data with MPI_Alltoallv
 #if REALCASE==1
-  call MPI_Alltoallv(r_sbuf,ncnt_s,nstart_s,MPI_REAL8,r_rbuf,ncnt_r,nstart_r,MPI_REAL8,mpi_comm,mpierr)
+
+#ifdef DOUBLE_PRECISION_REAL
+  call MPI_Alltoallv(r_sbuf, ncnt_s, nstart_s, MPI_REAL8, r_rbuf, ncnt_r, nstart_r, MPI_REAL8, mpi_comm, mpierr)
+#else
+  call MPI_Alltoallv(r_sbuf, ncnt_s, nstart_s, MPI_REAL4, r_rbuf, ncnt_r, nstart_r, MPI_REAL4, mpi_comm, mpierr)
 #endif
+
+#endif /* REALCASE==1 */
+
 #if COMPLEXCASE==1
-  call MPI_Alltoallv(c_sbuf,ncnt_s,nstart_s,MPI_COMPLEX16,c_rbuf,ncnt_r,nstart_r,MPI_COMPLEX16,mpi_comm,mpierr)
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+  call MPI_Alltoallv(c_sbuf, ncnt_s, nstart_s, MPI_COMPLEX16, c_rbuf, ncnt_r, nstart_r, MPI_COMPLEX16, mpi_comm, mpierr)
+#else
+  call MPI_Alltoallv(c_sbuf, ncnt_s, nstart_s, MPI_COMPLEX, c_rbuf, ncnt_r, nstart_r, MPI_COMPLEX, mpi_comm, mpierr)
 #endif
+#endif /* COMPLEXCASE==1 */
+
   ! set band from receive buffer
   ncnt_r(:) = ncnt_r(:)/(nblk*nblk)
diff --git a/test/c_test_programs/elpa1_test_complex_c_version.c b/test/c_test_programs/elpa1_test_complex_c_version.c
index 3039a36c..8003818a 100644
--- a/test/c_test_programs/elpa1_test_complex_c_version.c
+++ b/test/c_test_programs/elpa1_test_complex_c_version.c
@@ -72,11 +72,19 @@ int main(int argc, char** argv) {
   int info, *sc_desc;

   int na_rows, na_cols;
+  double startVal;

+#ifdef DOUBLE_PRECISION_COMPLEX
   complex double *a, *z, *as, *tmp1, *tmp2;

   double *ev, *xr;
+#else
+
+  complex float *a, *z, *as, *tmp1, *tmp2;
+
+  float *ev, *xr;
+#endif

   int *iseed;
@@ -103,6 +111,12 @@ int main(int argc, char** argv) {
     printf("\n");
+#ifdef DOUBLE_PRECISION_COMPLEX
+    printf("The double precision version of ELPA1 is used\n");
+#else
+    printf("The single precision version of ELPA1 is used\n");
+#endif
+    printf("\n");
   }

   status = 0;
@@ -161,6 +175,7 @@ int main(int argc, char** argv) {
     printf("\n");
   }

+#ifdef DOUBLE_PRECISION_COMPLEX
   a = malloc(na_rows*na_cols*sizeof(complex double));
   z = malloc(na_rows*na_cols*sizeof(complex double));
   as = malloc(na_rows*na_cols*sizeof(complex double));
@@ -172,10 +187,26 @@ int main(int argc, char** argv) {

   tmp1 = malloc(na_rows*na_cols*sizeof(complex double));
   tmp2 = malloc(na_rows*na_cols*sizeof(complex double));
+#else
+  a = malloc(na_rows*na_cols*sizeof(complex float));
+  z = malloc(na_rows*na_cols*sizeof(complex float));
+  as = malloc(na_rows*na_cols*sizeof(complex float));

-  iseed = malloc(4096*sizeof(int));
+  xr = malloc(na_rows*na_cols*sizeof(float));

-  prepare_matrix_complex_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as);
+
+  ev = malloc(na*sizeof(float));
+
+  tmp1 = malloc(na_rows*na_cols*sizeof(complex float));
+  tmp2 = malloc(na_rows*na_cols*sizeof(complex float));
+#endif
+
+  iseed = malloc(4096*sizeof(int));
+#ifdef DOUBLE_PRECISION_COMPLEX
+  prepare_matrix_complex_from_fortran_double_precision(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as);
+#else
+  prepare_matrix_complex_from_fortran_single_precision(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as);
+#endif

   free(xr);
@@ -187,8 +218,11 @@ int main(int argc, char** argv) {

   mpierr = MPI_Barrier(MPI_COMM_WORLD);

-  success = elpa_solve_evp_complex_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
-
+#ifdef DOUBLE_PRECISION_COMPLEX
+  success = elpa_solve_evp_complex_1stage_double_precision(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
+#else
+  success = elpa_solve_evp_complex_1stage_single_precision(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
+#endif
   if (success != 1) {
     printf("error in ELPA solve \n");
     mpierr = MPI_Abort(MPI_COMM_WORLD, 99);
@@ -202,7 +236,11 @@ int main(int argc, char** argv) {
   }

   /* check the results */
-  status = check_correctness_complex_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
+#ifdef DOUBLE_PRECISION_COMPLEX
+  status = check_correctness_complex_from_fortran_double_precision(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
+#else
+  status = check_correctness_complex_from_fortran_single_precision(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
+#endif

   if (status !=0){
     printf("The computed EVs are not correct !\n");
diff --git a/test/c_test_programs/elpa1_test_real_c_version.c b/test/c_test_programs/elpa1_test_real_c_version.c
index 547a3e76..46709e77 100644
--- a/test/c_test_programs/elpa1_test_real_c_version.c
+++ b/test/c_test_programs/elpa1_test_real_c_version.c
@@ -73,9 +73,11 @@ int main(int argc, char** argv) {
   int na_rows, na_cols;

   double startVal;
-
+#ifdef DOUBLE_PRECISION_REAL
   double *a, *z, *as, *ev, *tmp1, *tmp2;
-
+#else
+  float *a, *z, *as, *ev, *tmp1, *tmp2;
+#endif
   int *iseed;

   int success;
@@ -99,6 +101,12 @@ int main(int argc, char** argv) {
     printf("as it's Fortran counterpart. It's only purpose is to show how \n");
     printf("to evoke ELPA1 from a c programm\n");
     printf("\n");
+#ifdef DOUBLE_PRECISION_REAL
+    printf("The double precision version of ELPA1 is used\n");
+#else
+    printf("The single precision version of ELPA1 is used\n");
+#endif
+    printf("\n");
   }
@@ -157,7 +165,7 @@ int main(int argc, char** argv) {
     printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols);
     printf("\n");
   }
-
+#ifdef DOUBLE_PRECISION_REAL
   a = malloc(na_rows*na_cols*sizeof(double));
   z = malloc(na_rows*na_cols*sizeof(double));
   as = malloc(na_rows*na_cols*sizeof(double));
@@ -167,11 +175,24 @@ int main(int argc, char** argv) {

   tmp1 = malloc(na_rows*na_cols*sizeof(double));
   tmp2 = malloc(na_rows*na_cols*sizeof(double));
+#else
+  a = malloc(na_rows*na_cols*sizeof(float));
+  z = malloc(na_rows*na_cols*sizeof(float));
+  as = malloc(na_rows*na_cols*sizeof(float));

-  iseed = malloc(4096*sizeof(int));
-  prepare_matrix_real_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as);
+  ev = malloc(na*sizeof(float));

+  tmp1 = malloc(na_rows*na_cols*sizeof(float));
+  tmp2 = malloc(na_rows*na_cols*sizeof(float));
+#endif
+
+  iseed = malloc(4096*sizeof(int));
+#ifdef DOUBLE_PRECISION_REAL
+  prepare_matrix_real_from_fortran_double_precision(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as);
+#else
+  prepare_matrix_real_from_fortran_single_precision(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as);
+#endif

   if (myid == 0) {
     printf("\n");
     printf("Entering ELPA 1stage real solver\n");
@@ -179,9 +200,11 @@ int main(int argc, char** argv) {
   }

   mpierr = MPI_Barrier(MPI_COMM_WORLD);
-
-  success = elpa_solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
-
+#ifdef DOUBLE_PRECISION_REAL
+  success = elpa_solve_evp_real_1stage_double_precision(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
+#else
+  success = elpa_solve_evp_real_1stage_single_precision(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols);
+#endif
   if (success != 1) {
     printf("error in ELPA solve \n");
     mpierr = MPI_Abort(MPI_COMM_WORLD, 99);
@@ -194,9 +217,12 @@ int main(int argc, char** argv) {
     printf("\n");
   }

+#ifdef DOUBLE_PRECISION_REAL
   /* check the results */
-  status = check_correctness_real_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
-
+  status = check_correctness_real_from_fortran_double_precision(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
+#else
+  status = check_correctness_real_from_fortran_single_precision(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2);
+#endif
   if (status !=0){
     printf("The computed EVs are not correct !\n");
   }
diff --git a/test/fortran_test_programs/test_complex.F90 b/test/fortran_test_programs/test_complex.F90
index 7a0e9165..f75abd88 100644
--- a/test/fortran_test_programs/test_complex.F90
+++ b/test/fortran_test_programs/test_complex.F90
@@ -140,6 +140,22 @@ program test_complex
    endif
 #endif

+#ifdef DOUBLE_PRECISION_COMPLEX
+   if (myid .eq. 0) then
+     print *," "
+     print *,"Double precision version of ELPA1 is used"
+     print *," "
+   endif
+#else
+   if (myid .eq. 0) then
+     print *," "
+     print *,"Single precision version of ELPA1 is used"
+     print *," "
+   endif
+#endif
+
+   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
+
 #ifdef HAVE_REDIRECT
    if (check_redirect_environment_variable()) then
      if (myid .eq. 0) then

diff --git a/test/fortran_test_programs/test_real.F90 b/test/fortran_test_programs/test_real.F90
index 0a364864..b43968d0 100644
--- a/test/fortran_test_programs/test_real.F90
+++ b/test/fortran_test_programs/test_real.F90
@@ -178,6 +178,21 @@ program test_real
      print *," "
    endif
 #endif
+
+#ifdef DOUBLE_PRECISION_REAL
+   if (myid .eq. 0) then
+     print *," "
+     print *,"Double precision version of ELPA1 is used"
+     print *," "
+   endif
+#else
+   if (myid .eq. 0) then
+     print *," "
+     print *,"Single precision version of ELPA1 is used"
+     print *," "
+   endif
+#endif
+
    call MPI_BARRIER(MPI_COMM_WORLD, mpierr)

 #ifdef HAVE_REDIRECT
diff --git a/test/shared_sources/call_elpa1.c b/test/shared_sources/call_elpa1.c
index 5de63046..2499a0f4 100644
--- a/test/shared_sources/call_elpa1.c
+++ b/test/shared_sources/call_elpa1.c
@@ -40,19 +40,32 @@
 /* the original distribution, the GNU Lesser General Public License.   */
 /*                                                                     */
 /*                                                                     */
+#include "config-f90.h"
 #include
 #include
 #include
 #include
 #include

+#ifdef DOUBLE_PRECISION_REAL
 int call_elpa1_real_solver_from_c(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) {
-  return elpa_solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
+  return elpa_solve_evp_real_1stage_double_precision(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
 }
+#else
+int call_elpa1_real_solver_from_c(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) {
+  return elpa_solve_evp_real_1stage_single_precision(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
+}
+#endif

+#ifdef DOUBLE_PRECISION_COMPLEX
 int call_elpa1_complex_solver_from_c(int na, int nev, complex double *a, int lda, double *ev, complex double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) {
-  return elpa_solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
+  return elpa_solve_evp_complex_1stage_double_precision(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
+}
+#else
+int call_elpa1_complex_solver_from_c(int na, int nev, complex float *a, int lda, float *ev, complex float *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) {
+  return elpa_solve_evp_complex_1stage_single_precision(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols);
 }
+#endif

 int call_elpa_get_comm_from_c(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols) {
   return elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols);
diff --git a/test/shared_sources/check_correctnes.F90 b/test/shared_sources/check_correctnes.F90
index 52d63a04..69fedaf3 100644
--- a/test/shared_sources/check_correctnes.F90
+++ b/test/shared_sources/check_correctnes.F90
@@ -65,7 +65,7 @@ module mod_check_correctness
       real(kind=rk)               :: ev(:)
       complex(kind=ck)            :: xc
       integer(kind=ik)            :: sc_desc(:), mpierr
-      complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
+      complex(kind=ck), parameter :: CZERO = (0.0_rk,0.0_rk), CONE = (1.0_rk,0.0_rk)
       integer(kind=ik)            :: i
       real(kind=rk)               :: err, errmax
@@ -74,35 +74,54 @@ module mod_check_correctness

       ! 1. Residual (maximum of || A*Zi - Zi*EVi ||)

       ! tmp1 = A * Z
       ! as is original stored matrix, Z are the EVs
-      call pzgemm('N','N',na,nev,na,CONE,as,1,1,sc_desc, &
-                  z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc)
+#ifdef DOUBLE_PRECISION_COMPLEX
+      call pzgemm('N', 'N', na, nev, na, CONE, as, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, CZERO, tmp1, 1, 1, sc_desc)
+#else
+      call pcgemm('N', 'N', na, nev, na, CONE, as, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, CZERO, tmp1, 1, 1, sc_desc)
+#endif

       ! tmp2 = Zi*EVi
       tmp2(:,:) = z(:,:)
       do i=1,nev
          xc = ev(i)
-         call pzscal(na,xc,tmp2,1,i,sc_desc,1)
+#ifdef DOUBLE_PRECISION_COMPLEX
+         call pzscal(na, xc, tmp2, 1, i, sc_desc, 1)
+#else
+         call pcscal(na, xc, tmp2, 1, i, sc_desc, 1)
+#endif
       enddo

       ! tmp1 = A*Zi - Zi*EVi
       tmp1(:,:) = tmp1(:,:) - tmp2(:,:)

       ! Get maximum norm of columns of tmp1
-      errmax = 0.0
+      errmax = 0.0_rk
       do i=1,nev
          xc = 0
-         call pzdotc(na,xc,tmp1,1,i,sc_desc,1,tmp1,1,i,sc_desc,1)
-         errmax = max(errmax, sqrt(real(xc,8)))
+#ifdef DOUBLE_PRECISION_COMPLEX
+         call pzdotc(na, xc, tmp1, 1, i, sc_desc, 1, tmp1, 1, i, sc_desc, 1)
+#else
+         call pcdotc(na, xc, tmp1, 1, i, sc_desc, 1, tmp1, 1, i, sc_desc, 1)
+#endif
+         errmax = max(errmax, sqrt(real(xc,kind=rk)))
       enddo

       ! Get maximum error norm over all processors
       err = errmax
-
-      call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr)
+#ifdef DOUBLE_PRECISION_COMPLEX
+      call mpi_allreduce(err, errmax, 1, MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#else
+      call mpi_allreduce(err, errmax, 1, MPI_REAL4, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#endif
       if (myid==0) print *
       if (myid==0) print *,'Error Residual     :',errmax
-
-      if (errmax .gt. 5e-12) then
+#ifdef DOUBLE_PRECISION_COMPLEX
+      if (errmax .gt. 5e-12_rk) then
+#else
+      if (errmax .gt. 7e-4_rk) then
+#endif
        status = 1
       endif
@@ -110,22 +129,36 @@ module mod_check_correctness

       ! tmp1 = Z**T * Z
       tmp1 = 0
-      call pzgemm('C','N',nev,nev,na,CONE,z,1,1,sc_desc, &
-                  z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc)
-
+#ifdef DOUBLE_PRECISION_COMPLEX
+      call pzgemm('C', 'N', nev, nev, na, CONE, z, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, CZERO, tmp1, 1, 1, sc_desc)
+#else
+      call pcgemm('C', 'N', nev, nev, na, CONE, z, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, CZERO, tmp1, 1, 1, sc_desc)
+#endif
       ! Initialize tmp2 to unit matrix
       tmp2 = 0
-      call pzlaset('A',nev,nev,CZERO,CONE,tmp2,1,1,sc_desc)
-
+#ifdef DOUBLE_PRECISION_COMPLEX
+      call pzlaset('A', nev, nev, CZERO, CONE, tmp2, 1, 1, sc_desc)
+#else
+      call pclaset('A', nev, nev, CZERO, CONE, tmp2, 1, 1, sc_desc)
+#endif
       ! tmp1 = Z**T * Z - Unit Matrix
       tmp1(:,:) = tmp1(:,:) - tmp2(:,:)

       ! Get maximum error (max abs value in tmp1)
       err = maxval(abs(tmp1))
-      call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr)
+#ifdef DOUBLE_PRECISION_COMPLEX
+      call mpi_allreduce(err, errmax, 1, MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#else
+      call mpi_allreduce(err, errmax, 1, MPI_REAL4, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#endif
       if (myid==0) print *,'Error Orthogonality:',errmax
-
-      if (errmax .gt. 5e-12) then
+#ifdef DOUBLE_PRECISION_COMPLEX
+      if (errmax .gt. 5e-12_rk) then
+#else
+      if (errmax .gt. 6e-5_rk) then
+#endif
        status = 1
       endif
    end function
@@ -150,34 +183,52 @@ module mod_check_correctness

       ! 1. Residual (maximum of || A*Zi - Zi*EVi ||)

       ! tmp1 = A * Z
-      call pdgemm('N','N',na,nev,na,1.d0,as,1,1,sc_desc, &
-                  z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc)
-
+#ifdef DOUBLE_PRECISION_REAL
+      call pdgemm('N', 'N', na, nev, na, 1.0_rk, as, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, 0.0_rk, tmp1, 1, 1, sc_desc)
+#else
+      call psgemm('N', 'N', na, nev, na, 1.0_rk, as, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, 0.0_rk, tmp1, 1, 1, sc_desc)
+#endif
       ! tmp2 = Zi*EVi
       tmp2(:,:) = z(:,:)
       do i=1,nev
-         call pdscal(na,ev(i),tmp2,1,i,sc_desc,1)
+#ifdef DOUBLE_PRECISION_REAL
+         call pdscal(na, ev(i), tmp2, 1, i, sc_desc, 1)
+#else
+         call psscal(na, ev(i), tmp2, 1, i, sc_desc, 1)
+#endif
       enddo

       ! tmp1 = A*Zi - Zi*EVi
       tmp1(:,:) = tmp1(:,:) - tmp2(:,:)

       ! Get maximum norm of columns of tmp1
-      errmax = 0.0
+      errmax = 0.0_rk
       do i=1,nev
-         err = 0.0
-         call pdnrm2(na,err,tmp1,1,i,sc_desc,1)
+         err = 0.0_rk
+#ifdef DOUBLE_PRECISION_REAL
+         call pdnrm2(na, err, tmp1, 1, i, sc_desc, 1)
+#else
+         call psnrm2(na, err, tmp1, 1, i, sc_desc, 1)
+#endif
         errmax = max(errmax, err)
       enddo

       ! Get maximum error norm over all processors
       err = errmax
-
-      call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr)
+#ifdef DOUBLE_PRECISION_REAL
+      call mpi_allreduce(err, errmax, 1, MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#else
+      call mpi_allreduce(err, errmax, 1, MPI_REAL4, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#endif
       if (myid==0) print *
       if (myid==0) print *,'Error Residual     :',errmax
-
-      if (errmax .gt. 9e-9) then
+#ifdef DOUBLE_PRECISION_REAL
+      if (errmax .gt. 9e-12_rk) then
+#else
+      if (errmax .gt. 7e-4_rk) then
+#endif
       status = 1
       endif
@@ -185,63 +236,110 @@ module mod_check_correctness

       ! tmp1 = Z**T * Z
       tmp1 = 0
-      call pdgemm('T','N',nev,nev,na,1.d0,z,1,1,sc_desc, &
-                  z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc)
+#ifdef DOUBLE_PRECISION_REAL
+      call pdgemm('T', 'N', nev, nev, na, 1.0_rk, z, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, 0.0_rk, tmp1, 1, 1, sc_desc)
+#else
+      call psgemm('T', 'N', nev, nev, na, 1.0_rk, z, 1, 1, sc_desc, &
+                  z, 1, 1, sc_desc, 0.0_rk, tmp1, 1, 1, sc_desc)
+#endif

       ! Initialize tmp2 to unit matrix
       tmp2 = 0
-      call pdlaset('A',nev,nev,0.d0,1.d0,tmp2,1,1,sc_desc)
-
+#ifdef DOUBLE_PRECISION_REAL
+      call pdlaset('A', nev, nev, 0.0_rk, 1.0_rk, tmp2, 1, 1, sc_desc)
+#else
+      call pslaset('A', nev, nev, 0.0_rk, 1.0_rk, tmp2, 1, 1, sc_desc)
+#endif
       ! tmp1 = Z**T * Z - Unit Matrix
       tmp1(:,:) = tmp1(:,:) - tmp2(:,:)

       ! Get maximum error (max abs value in tmp1)
       err = maxval(abs(tmp1))
-      call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr)
+#ifdef DOUBLE_PRECISION_REAL
+      call mpi_allreduce(err, errmax, 1, MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#else
+      call mpi_allreduce(err, errmax, 1, MPI_REAL4, MPI_MAX, MPI_COMM_WORLD, mpierr)
+#endif
       if (myid==0) print *,'Error Orthogonality:',errmax
-
-      if (errmax .gt. 9e-9) then
+#ifdef DOUBLE_PRECISION_REAL
+      if (errmax .gt. 9e-12_rk) then
+#else
+      if (errmax .gt. 6e-5_rk) then
+#endif
       status = 1
       endif
    end function

-   !c> int check_correctness_real_from_fortran(int na, int nev, int na_rows, int na_cols,
+#ifdef DOUBLE_PRECISION_REAL
+   !c> int check_correctness_real_from_fortran_double_precision(int na, int nev, int na_rows, int na_cols,
    !c>                                         double *as, double *z, double *ev,
    !c>                                         int sc_desc[9], int myid,
    !c>                                         double *tmp1, double *tmp2);
+#else
+   !c> int check_correctness_real_from_fortran_single_precision(int na, int nev, int na_rows, int na_cols,
+   !c>                                         float *as, float *z, float *ev,
+   !c>                                         int sc_desc[9], int myid,
+   !c>                                         float *tmp1, float *tmp2);
+#endif
    function check_correctness_real_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) &
-            bind(C,name="check_correctness_real_from_fortran")
-
+#ifdef DOUBLE_PRECISION_REAL
+            bind(C,name="check_correctness_real_from_fortran_double_precision")
+#else
+            bind(C,name="check_correctness_real_from_fortran_single_precision")
+#endif
      use iso_c_binding
      implicit none

      integer(kind=c_int)         :: status
      integer(kind=c_int), value  :: na, nev, myid, na_rows, na_cols
+#ifdef DOUBLE_PRECISION_REAL
      real(kind=c_double)         :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols)
      real(kind=c_double)         :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols)
      real(kind=c_double)         :: ev(1:na)
+#else
+     real(kind=c_float)          :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols)
+     real(kind=c_float)          :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols)
+     real(kind=c_float)          :: ev(1:na)
+#endif
      integer(kind=c_int)         :: sc_desc(1:9)

      status = check_correctness_real(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2)
    end function

-   !c> int check_correctness_complex_from_fortran(int na, int nev, int na_rows, int na_cols,
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+   !c> int check_correctness_complex_from_fortran_double_precision(int na, int nev, int na_rows, int na_cols,
    !c>                                            complex double *as, complex double *z, double *ev,
    !c>                                            int sc_desc[9], int myid,
    !c>                                            complex double *tmp1, complex double *tmp2);
+#else
+   !c> int check_correctness_complex_from_fortran_single_precision(int na, int nev, int na_rows, int na_cols,
+   !c>                                            complex float *as, complex float *z, float *ev,
+   !c>                                            int sc_desc[9], int myid,
+   !c>                                            complex float *tmp1, complex float *tmp2);
+#endif
    function check_correctness_complex_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) &
-            bind(C,name="check_correctness_complex_from_fortran")
-
+#ifdef DOUBLE_PRECISION_COMPLEX
+            bind(C,name="check_correctness_complex_from_fortran_double_precision")
+#else
+            bind(C,name="check_correctness_complex_from_fortran_single_precision")
+#endif
      use iso_c_binding
      implicit none

      integer(kind=c_int)         :: status
      integer(kind=c_int), value  :: na, nev, myid, na_rows, na_cols
+#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=c_double)      :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols)
      complex(kind=c_double)      :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols)
      real(kind=c_double)         :: ev(1:na)
+#else
+     complex(kind=c_float)       :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols)
+     complex(kind=c_float)       :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols)
+     real(kind=c_float)          :: ev(1:na)
+#endif
      integer(kind=c_int)         :: sc_desc(1:9)

      status = check_correctness_complex(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2)
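The pass/fail thresholds above are hard-coded per precision (5e-12/9e-12 for double, 7e-4/6e-5 for single). A hedged way to see why they scale is that both error measures grow like a modest multiple of the machine epsilon of the working kind. The sketch below derives a comparable bound from epsilon() instead of hard-coding it; it is an illustrative alternative, not what the patch does, and the matrix size na and the safety factor 100 are assumptions:

#define DOUBLE_PRECISION_REAL 1

program tolerance_sketch
  use iso_c_binding, only : c_float, c_double
  implicit none
#ifdef DOUBLE_PRECISION_REAL
  integer, parameter :: rk = c_double
#else
  integer, parameter :: rk = c_float
#endif
  integer, parameter :: na = 4000   ! assumed matrix size
  real(kind=rk)      :: tol

  ! safety factor (assumed 100) times epsilon times sqrt(n) gives roughly
  ! 1e-12 for c_double and 8e-4 for c_float at na = 4000 -- the same order
  ! as the constants used in check_correctnes.F90.
  tol = 100.0_rk * epsilon(1.0_rk) * sqrt(real(na, kind=rk))
  print *, 'epsilon-derived residual tolerance:', tol
end program tolerance_sketch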
diff --git a/test/shared_sources/mod_from_c.F90 b/test/shared_sources/mod_from_c.F90
index 0616ee92..faf65c60 100644
--- a/test/shared_sources/mod_from_c.F90
+++ b/test/shared_sources/mod_from_c.F90
@@ -40,6 +40,7 @@
 ! the original distribution, the GNU Lesser General Public License.
 !
 !
+#include "config-f90.h"
 module from_c

   implicit none
@@ -54,7 +55,11 @@ module from_c
      implicit none
      integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+#ifdef DOUBLE_PRECISION_REAL
      real(kind=c_double)        :: a(1:lda,1:matrixCOls), ev(1:na), q(1:ldq,1:matrixCols)
+#else
+     real(kind=c_float)         :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols)
+#endif

    end function elpa1_real_c
@@ -69,9 +74,13 @@ module from_c
      implicit none
      integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+#ifdef DOUBLE_PRECISION_COMPLEX
      real(kind=c_double)        :: ev(1:na)
      complex(kind=c_double)     :: a(1:lda,1:matrixCOls), q(1:ldq,1:matrixCols)
-
+#else
+     real(kind=c_float)         :: ev(1:na)
+     complex(kind=c_float)      :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols)
+#endif
    end function elpa1_complex_c
@@ -101,9 +110,11 @@ module from_c
      integer(kind=ik)  :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical           :: success
      integer(kind=ik)  :: successC
-
+#ifdef DOUBLE_PRECISION_REAL
      real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols)
-
+#else
+     real(kind=c_float)  :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols)
+#endif
      successC = elpa1_real_c(na, nev, a, lda, ev, q, ldq, nblk, &
                              matrixCols, mpi_comm_rows, mpi_comm_cols)
@@ -126,10 +137,13 @@ module from_c
      integer(kind=ik)  :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical           :: success
      integer(kind=ik)  :: successC
-
+#ifdef DOUBLE_PRECISION_COMPLEX
      real(kind=c_double)    :: ev(1:na)
      complex(kind=c_double) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols)
-
+#else
+     real(kind=c_float)     :: ev(1:na)
+     complex(kind=c_float)  :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols)
+#endif
      successC = elpa1_complex_c(na, nev, a, lda, ev, q, ldq, nblk, &
                                 matrixCols, mpi_comm_rows, mpi_comm_cols)
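The device used in mod_from_c.F90 and in the wrappers above — one Fortran procedure whose exported C symbol changes with the build precision via bind(C, name=...) — reduces to a few lines. A sketch with a hypothetical function name (scale_vector is illustrative, not an ELPA routine):

#define DOUBLE_PRECISION_REAL 1

module bindc_sketch
  use iso_c_binding
  implicit none
contains
#ifdef DOUBLE_PRECISION_REAL
  function scale_vector(n, x) result(ok) bind(C, name="scale_vector_double_precision")
    integer(kind=c_int), value :: n
    real(kind=c_double)        :: x(1:n)
#else
  function scale_vector(n, x) result(ok) bind(C, name="scale_vector_single_precision")
    integer(kind=c_int), value :: n
    real(kind=c_float)         :: x(1:n)
#endif
    integer(kind=c_int)        :: ok
    x  = 2.0 * x    ! stand-in for real work; only the declarations differ
    ok = 1
  end function scale_vector
end module bindc_sketch

A C caller then selects the matching symbol with the same preprocessor test, which is exactly how the test programs pick prepare_matrix_*_from_fortran_{double,single}_precision above.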
diff --git a/test/shared_sources/prepare_matrix.F90 b/test/shared_sources/prepare_matrix.F90
index e9a44196..303a5402 100644
--- a/test/shared_sources/prepare_matrix.F90
+++ b/test/shared_sources/prepare_matrix.F90
@@ -40,6 +40,8 @@
 ! the original distribution, the GNU Lesser General Public License.
 !
 !
+#include "config-f90.h"
+
 module mod_prepare_matrix

   interface prepare_matrix
@@ -59,7 +61,7 @@ module mod_prepare_matrix
      real(kind=rk), intent(inout)    :: xr(:,:)
      complex(kind=ck), intent(inout) :: z(:,:), a(:,:), as(:,:)

-     complex(kind=ck), parameter     :: CZERO = (0.d0, 0.d0), CONE = (1.d0, 0.d0)
+     complex(kind=ck), parameter     :: CZERO = (0.0_rk, 0.0_rk), CONE = (1.0_rk, 0.0_rk)

      ! for getting a hermitian test matrix A we get a random matrix Z
      ! and calculate A = Z + Z**H
@@ -72,7 +74,7 @@ module mod_prepare_matrix
      call RANDOM_NUMBER(xr)
      z(:,:) = xr(:,:)
      call RANDOM_NUMBER(xr)
-     z(:,:) = z(:,:) + (0.d0,1.d0)*xr(:,:)
+     z(:,:) = z(:,:) + (0.0_rk,1.0_rk)*xr(:,:)

      a(:,:) = z(:,:)

@@ -80,8 +82,11 @@ module mod_prepare_matrix
        print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)'
      endif

+#ifdef DOUBLE_PRECISION_COMPLEX
      call pztranc(na, na, CONE, z, 1, 1, sc_desc, CONE, a, 1, 1, sc_desc) ! A = A + Z**H
-
+#else
+     call pctranc(na, na, CONE, z, 1, 1, sc_desc, CONE, a, 1, 1, sc_desc) ! A = A + Z**H
+#endif
      if (myid == 0) then
        print '(a)','| Random matrix block has been symmetrized'
      endif
@@ -117,8 +122,11 @@ module mod_prepare_matrix
        print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)'
      endif

-     call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T
-
+#ifdef DOUBLE_PRECISION_REAL
+     call pdtran(na, na, 1.0_rk, z, 1, 1, sc_desc, 1.0_rk, a, 1, 1, sc_desc) ! A = A + Z**T
+#else
+     call pstran(na, na, 1.0_rk, z, 1, 1, sc_desc, 1.0_rk, a, 1, 1, sc_desc) ! A = A + Z**T
+#endif
      if (myid == 0) then
        print '(a)','| Random matrix block has been symmetrized'
      endif
@@ -128,12 +136,21 @@ module mod_prepare_matrix
      as = a

    end subroutine

-   !c> void prepare_matrix_real_from_fortran(int na, int myid, int na_rows, int na_cols,
+#ifdef DOUBLE_PRECISION_REAL
+   !c> void prepare_matrix_real_from_fortran_double_precision(int na, int myid, int na_rows, int na_cols,
    !c>                                       int sc_desc[9], int iseed[4096],
    !c>                                       double *a, double *z, double *as);
+#else
+   !c> void prepare_matrix_real_from_fortran_single_precision(int na, int myid, int na_rows, int na_cols,
+   !c>                                       int sc_desc[9], int iseed[4096],
+   !c>                                       float *a, float *z, float *as);
+#endif
    subroutine prepare_matrix_real_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as) &
-              bind(C, name="prepare_matrix_real_from_fortran")
+#ifdef DOUBLE_PRECISION_REAL
+              bind(C, name="prepare_matrix_real_from_fortran_double_precision")
+#else
+              bind(C, name="prepare_matrix_real_from_fortran_single_precision")
+#endif
      use iso_c_binding
      implicit none
@@ -141,16 +158,31 @@ module mod_prepare_matrix
      integer(kind=c_int), value :: myid, na, na_rows, na_cols
      integer(kind=c_int)        :: sc_desc(1:9)
      integer(kind=c_int)        :: iseed(1:4096)
+#ifdef DOUBLE_PRECISION_REAL
      real(kind=c_double)        :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), &
                                    as(1:na_rows,1:na_cols)
-
+#else
+     real(kind=c_float)         :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), &
+                                   as(1:na_rows,1:na_cols)
+#endif
      call prepare_matrix_real(na, myid, sc_desc, iseed, a, z, as)

    end subroutine

-   !c> void prepare_matrix_complex_from_fortran(int na, int myid, int na_rows, int na_cols,
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+   !c> void prepare_matrix_complex_from_fortran_double_precision(int na, int myid, int na_rows, int na_cols,
    !c>                                          int sc_desc[9], int iseed[4096], double *xr,
    !c>                                          complex double *a, complex double *z, complex double *as);
+#else
+   !c> void prepare_matrix_complex_from_fortran_single_precision(int na, int myid, int na_rows, int na_cols,
+   !c>                                          int sc_desc[9], int iseed[4096], float *xr,
+   !c>                                          complex float *a, complex float *z, complex float *as);
+#endif
    subroutine prepare_matrix_complex_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as) &
-              bind(C, name="prepare_matrix_complex_from_fortran")
+#ifdef DOUBLE_PRECISION_COMPLEX
+              bind(C, name="prepare_matrix_complex_from_fortran_double_precision")
+#else
+              bind(C, name="prepare_matrix_complex_from_fortran_single_precision")
+#endif
      use iso_c_binding
      implicit none
@@ -158,9 +190,15 @@ module mod_prepare_matrix
      integer(kind=c_int), value :: myid, na, na_rows, na_cols
      integer(kind=c_int)        :: sc_desc(1:9)
      integer(kind=c_int)        :: iseed(1:4096)
+#ifdef DOUBLE_PRECISION_COMPLEX
      real(kind=c_double)        :: xr(1:na_rows,1:na_cols)
      complex(kind=c_double)     :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), &
                                    as(1:na_rows,1:na_cols)
+#else
+     real(kind=c_float)         :: xr(1:na_rows,1:na_cols)
+     complex(kind=c_float)      :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), &
+                                   as(1:na_rows,1:na_cols)
+#endif
      call prepare_matrix_complex(na, myid, sc_desc, iseed, xr, a, z, as)

    end subroutine
--
GitLab
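For reference, the A = Z + Z**H construction that prepare_matrix.F90 performs with the distributed pztranc/pctranc calls can be written serially in a few lines. The sketch below is a plain-Fortran stand-in (no ScaLAPACK; the na value and the ck selection mirror the patch's conventions but the program itself is hypothetical):

#define DOUBLE_PRECISION_COMPLEX 1

program hermitian_sketch
  use iso_c_binding, only : c_float, c_double
  implicit none
#ifdef DOUBLE_PRECISION_COMPLEX
  integer, parameter :: ck = c_double
#else
  integer, parameter :: ck = c_float
#endif
  integer, parameter :: na = 4
  real(kind=ck)      :: xr(na,na), xi(na,na)
  complex(kind=ck)   :: z(na,na), a(na,na)

  ! random Z; then A = Z + Z**H is Hermitian by construction
  call random_number(xr)
  call random_number(xi)
  z = cmplx(xr, xi, kind=ck)
  a = z + conjg(transpose(z))

  ! sanity check: should print (numerically) zero
  print *, 'max |A - A**H| =', maxval(abs(a - conjg(transpose(a))))
end program hermitian_sketch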