Commit 2f8836f7 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 1151f6a5 392d782a
......@@ -29,7 +29,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdgeqrf.f90
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
src/ftimings/ftimings.F90 \
......@@ -48,19 +48,19 @@ if WITH_GPU_VERSION
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
if WITH_COMPLEX_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.F90
endif
if WITH_REAL_GENERIC_SIMPLE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.F90
endif
if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.F90
endif
if WITH_REAL_BGP_KERNEL
......@@ -320,6 +320,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
......
......@@ -721,6 +721,11 @@ DX_MAN_FEATURE(ON)
DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS])
AC_SUBST([with_amd_bulldozer_kernel])
......
......@@ -137,9 +137,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), d(na), e(na), tau(na)
! was
! real a(lda,*)
real(kind=rk) :: d(na), e(na), tau(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*)
#else
real(kind=rk) :: a(lda,matrixCols)
#endif
integer(kind=ik), parameter :: max_stored_rows = 32
......@@ -480,9 +483,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! real a(lda,*), q(ldq,*)
real(kind=rk) :: tau(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*), q(ldq,*)
#else
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
integer(kind=ik) :: max_stored_rows
......@@ -914,9 +920,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), tau(na)
! was
! complex a(lda,*)
complex(kind=ck) :: tau(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*)
#else
complex(kind=ck) :: a(lda,matrixCols)
#endif
real(kind=rk) :: d(na), e(na)
integer(kind=ik), parameter :: max_stored_rows = 32
......@@ -1283,10 +1292,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! complex a(lda,*), q(ldq,*)
complex(kind=ck) :: tau(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*), q(ldq,*)
#else
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
integer(kind=ik) :: max_stored_rows
complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
......@@ -1491,7 +1502,7 @@ module ELPA1_compute
character*1 :: uplo_a, uplo_c
integer(kind=ik), intent(in) :: lda, ldaCols, ldb, ldbCols, ldc, ldcCols
integer(kind=ik) :: na, ncb, nblk, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols)
complex(kind=ck) :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols) ! removed assumed_size
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_rows_np
......@@ -1685,9 +1696,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
real(kind=rk) :: d(na), e(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: q(ldq,*)
#else
real(kind=rk) :: q(ldq,matrixCols)
#endif
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -1911,9 +1925,12 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
real(kind=rk) :: d(na), e(na)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: q(ldq,*)
#else
real(kind=rk) :: q(ldq,matrixCols)
#endif
integer(kind=ik), parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used
......@@ -2186,9 +2203,12 @@ module ELPA1_compute
integer(kind=ik) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, npc_0, npc_n
integer(kind=ik) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=rk) :: d(na), e, q(ldq,matrixCols)
! was
! real q(ldq,*)
real(kind=rk) :: d(na), e
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: q(ldq,*)
#else
real(kind=rk) :: q(ldq,matrixCols)
#endif
integer(kind=ik), parameter :: max_strip=128
......@@ -2234,8 +2254,12 @@ module ELPA1_compute
! If my processor column isn't in the requested set, do nothing
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) return
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
! Determine number of "next" and "prev" column for ring sends
if (my_pcol == npc_0+npc_n-1) then
......@@ -2251,11 +2275,19 @@ module ELPA1_compute
endif
call check_monotony(nm,d,'Input1',wantDebug, success)
if (.not.(success)) return
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
call check_monotony(na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) return
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
! Get global number of processors and my processor number.
! Please note that my_proc does not need to match any real processor number,
! it is just used for load balancing some loops.
......@@ -2458,9 +2490,19 @@ module ELPA1_compute
enddo
call check_monotony(na1,d1,'Sorted1', wantDebug, success)
if (.not.(success)) return
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
call check_monotony(na2,d2,'Sorted2', wantDebug, success)
if (.not.(success)) return
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
if (na1==1 .or. na1==2) then
! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
......@@ -2631,8 +2673,12 @@ module ELPA1_compute
d(i) = tmp(idx(i))
enddo
call check_monotony(na,d,'Output', wantDebug, success)
if (.not.(success)) return
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems")
#endif
return
endif
! Eigenvector calculations
......@@ -3505,10 +3551,11 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*)
#else
real(kind=rk) :: a(lda,matrixCols)
! was
! real a(lda,*)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info, ns, nb
......@@ -3643,10 +3690,11 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*)
#else
complex(kind=ck) :: a(lda,matrixCols)
!was
! complex a(lda,*)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info
......@@ -3822,10 +3870,11 @@ module ELPA1_compute
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*)
#else
complex(kind=ck) :: a(lda,matrixCols)
! was
! complex a(lda,*)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info, ns, nb
......
......@@ -142,13 +142,16 @@ module ELPA2_compute
implicit none
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*), tmat(nbw,nbw,*)
#else
real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
! was
! real a(lda,*), tmat(nbw,nbw,*)
#endif
real(kind=rk) :: eps
logical, intent(in) :: useGPU
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: l_cols, l_rows, vmrCols
integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
......@@ -224,7 +227,7 @@ module ELPA2_compute
endif
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(pqrparam, nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tauvector "//errorMessage
......@@ -244,8 +247,18 @@ module ELPA2_compute
stop
endif
call qr_pdgeqrf_2dcomm(a, lda, vmrCPU, max(l_rows,1), tauvector, tmat(1,1,1), nbw, dwork_size(1), -1, na, &
nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
vmrCols = na
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif
work_size = dwork_size(1)
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -447,12 +460,24 @@ module ELPA2_compute
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pdgeqrf_2dcomm(a, lda, vmrCPU, max(l_rows,1), tauvector(1), &
tmat(1,1,istep), nbw, work_blocked, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
vmrCols = 2*n_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#else
call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
max(l_rows,1), vmrCols, tauvector(1:na), na, &
tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#endif
endif
else !useQR
......@@ -1162,8 +1187,11 @@ module ELPA2_compute
use precision
implicit none
integer(kind=ik) :: n, lda, ldb, comm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*)
#else
real(kind=rk) :: a(lda,ldb)
#endif
integer(kind=ik) :: i, nc, mpierr
real(kind=rk) :: h1(n*n), h2(n*n)
......@@ -1237,10 +1265,11 @@ module ELPA2_compute
implicit none
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
! was
! real a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
......@@ -1690,9 +1719,11 @@ module ELPA2_compute
implicit none
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk), intent(in) :: a(lda,*)
#else
real(kind=rk), intent(in) :: a(lda,matrixCols)
! was
! real a(lda,*)
#endif
real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
real(kind=rk), intent(out), &
allocatable :: hh_trans_real(:,:)
......@@ -2482,9 +2513,11 @@ module ELPA2_compute
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: q(ldq,*)
#else
real(kind=rk) :: q(ldq,matrixCols)
! was
! real q(ldq,*)
#endif
real(kind=rk), intent(out) :: hh_trans_real(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
......@@ -3994,6 +4027,7 @@ module ELPA2_compute
if (na <= 0) then
limits(:) = 0
call timer%stop("determine_workload")
return
endif
......@@ -4055,10 +4089,11 @@ module ELPA2_compute
logical, intent(in) :: useGPU
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*), tmat(nbw,nbw,*)
#else
complex(kind=ck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
! was
! complex a(lda,*), tmat(nbw,nbw,*)
#endif
complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -4862,9 +4897,11 @@ module ELPA2_compute
logical, intent(in) :: useGPU
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
! was
! complex a(lda,*),q(ldq,*),tmat(nbw,nbw,*)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
......@@ -5204,9 +5241,11 @@ module ELPA2_compute
!#endif
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck),intent(in) :: a(lda,*)
#else
complex(kind=ck), intent(in) :: a(lda,matrixCols)
! was
! complex a(lda,*)
#endif
real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
complex(kind=ck), intent(inout), &
allocatable :: hh_trans_complex(:,:)
......@@ -6190,9 +6229,11 @@ module ELPA2_compute
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: q(ldq,*)
#else
complex(kind=ck) :: q(ldq,matrixCols)
! was
! complex q(ldq,*)
#endif
complex(kind=ck) :: hh_trans_complex(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: tmp
......@@ -8177,8 +8218,8 @@ end subroutine
implicit none
integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, mpi_comm
real(kind=rk), intent(inout) :: ab(2*nb,nbCol)
real(kind=rk), intent(inout) :: ab2(2*nb2,nb2Col)
real(kind=rk), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size
real(kind=rk), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size
real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
real(kind=rk) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
......@@ -8494,7 +8535,7 @@ end subroutine
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W
real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y
real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
......@@ -8523,7 +8564,7 @@ end subroutine
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W
real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y
real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
......@@ -8551,7 +8592,7 @@ end subroutine
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed
real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W
real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y
real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb
......
......@@ -53,6 +53,9 @@
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#include "config-f90.h"
module complex_generic_kernel
private
......@@ -60,13 +63,24 @@ module complex_generic_kernel
contains
subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck), intent(inout) :: q(ldq,*)
complex(kind=ck), intent(in) :: hh(*)
#else
complex(kind=ck), intent(inout) :: q(1:ldq,1:nb)
complex(kind=ck), intent(in) :: hh(1:nb)
#endif
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: single_hh_trafo_complex_generic")
#endif
! Safety only:
......@@ -77,33 +91,58 @@ contains
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
#else
call hh_trafo_complex_kernel_12(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
#else
call hh_trafo_complex_kernel_8(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
else if(nq-i+1 > 0) then