Commit 28854290 authored by Andreas Marek's avatar Andreas Marek
Browse files

Templetize QR, fix openmp build

parent c0628347
......@@ -78,6 +78,9 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/elpa2_compute_complex_template.X90 \
src/elpa1/elpa1_template.X90 \
src/elpa2/elpa2_template.X90 \
src/elpa2/qr/qr_utils_template.X90 \
src/elpa2/qr/elpa_pdlarfb_template.X90 \
src/elpa2/qr/elpa_pdgeqrf_template.X90 \
src/elpa2/elpa2_bandred_template.X90 \
src/elpa2/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2/elpa2_trans_ev_band_to_full_template.X90 \
......@@ -1235,6 +1238,9 @@ EXTRA_DIST = \
src/elpa2/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa1/elpa1_template.X90 \
src/elpa2/elpa2_template.X90 \
src/elpa2/qr/qr_utils_template.X90 \
src/elpa2/qr/elpa_pdlarfb_template.X90 \
src/elpa2/qr/elpa_pdgeqrf_template.X90 \
src/elpa2/elpa2_tridiag_band_template.X90 \
src/elpa2/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.X90 \
......@@ -1253,7 +1259,7 @@ EXTRA_DIST = \
src/elpa1/elpa_invert_trm.X90 \
src/elpa1/elpa_multiply_a_b.X90 \
src/elpa1/elpa_solve_tridi_impl_public.X90 \
src/elpa2/qr/elpa_qrkernels.X90 \
src/elpa2/qr/elpa_qrkernels_template.X90 \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.Xcu \
src/GPU/cuUtils_template.Xcu \
......
......@@ -508,7 +508,7 @@
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, &
!$OMP d1,dbase, ddiff, z, ev_scale) &
!$OMP d1,dbase, ddiff, z, ev_scale, obj) &
!$OMP DEFAULT(NONE)
#endif
......
......@@ -324,14 +324,14 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
& (obj, a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
& (obj, a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
#endif
......@@ -342,14 +342,14 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
& (obj, a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
& (obj, a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */
......@@ -379,13 +379,13 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (obj, a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (obj, a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#else /* WITH_OPENMP */
......@@ -395,13 +395,13 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (obj, a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (obj, a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -429,13 +429,13 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
& (obj, a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
& (obj, a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#endif
......@@ -446,13 +446,13 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
& (obj, a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
& (obj, a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
#endif
......@@ -481,7 +481,7 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (obj, a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
......@@ -497,13 +497,13 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (obj, a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (obj, a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -995,7 +995,7 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width, 1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
& (obj, a(1:stripe_width, 1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl,stripe_width)
#else
if (j==1) call single_hh_trafo_&
......@@ -1065,14 +1065,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj,a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
& (obj,a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
......@@ -1139,14 +1139,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
......@@ -1213,14 +1213,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
......@@ -1307,14 +1307,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
......@@ -1398,14 +1398,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
......@@ -1488,14 +1488,14 @@
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
& (obj, a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
......
......@@ -2329,7 +2329,7 @@
call pack_row_complex_cpu_openmp_&
#endif
&PRECISION&
&(aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev)
&(obj,aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else /* WITH_OPENMP */
#if REALCASE == 1
......@@ -2368,7 +2368,7 @@
call pack_row_complex_cpu_openmp_&
#endif
&PRECISION&
&(aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, &
&(obj,aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, &
max_threads, thread_width, l_nev)
#else /* WITH_OPENMP */
......
......@@ -462,7 +462,7 @@
endif
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
call divide_band(obj,nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -782,7 +782,7 @@
call hh_transform_complex_&
#endif
&PRECISION &
(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
(obj,ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#if REALCASE == 1
hv_t(1 ,my_thread) = CONST_1_0
......
......@@ -59,13 +59,15 @@
! the Intel compiler creates a temp array copy of array q!
! this should be prevented, if possible without using assumed size arrays
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine single_hh_trafo_complex_generic_double(q, hh, nb, nq, ldq)
subroutine single_hh_trafo_complex_generic_double(obj, q, hh, nb, nq, ldq)
#else
subroutine single_hh_trafo_complex_generic_single(q, hh, nb, nq, ldq)
subroutine single_hh_trafo_complex_generic_single(obj, q, hh, nb, nq, ldq)
#endif
use precision
use elpa_api
implicit none
class(elpa_t) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*)
......@@ -79,11 +81,11 @@
#ifdef DOUBLE_PRECISION_COMPLEX
! call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_double")
call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_double")
#else
! call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_single")
call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_single")
#endif
! Safety only:
......@@ -156,11 +158,11 @@
#ifdef DOUBLE_PRECISION_COMPLEX
! call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_double")
call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_double")
#else
! call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_single")
call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_single")
#endif
......
......@@ -62,12 +62,14 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
& (obj, q, hh, nb, nq, ldq, ldh)
use precision
use iso_c_binding
use elpa_api
implicit none
class(elpa_t) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef USE_ASSUMED_SIZE
real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
......@@ -84,11 +86,11 @@
! Safety only:
! call obj%timer%start("kernel generic: double_hh_trafo_&
! &MATH_DATATYPE&
! &_generic" // &
! &PRECISION_SUFFIX &
! )
call obj%timer%start("kernel generic: double_hh_trafo_&
&MATH_DATATYPE&
&_generic" // &
&PRECISION_SUFFIX &
)
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
......@@ -149,11 +151,11 @@
endif
! call obj%timer%stop("kernel generic: double_hh_trafo_&
! &MATH_DATATYPE&
! &_generic" // &
! &PRECISION_SUFFIX &
! )
call obj%timer%stop("kernel generic: double_hh_trafo_&
&MATH_DATATYPE&
&_generic" // &
&PRECISION_SUFFIX &
)
end subroutine
......
......@@ -66,11 +66,12 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (q, hh, nb, nq, ldq)
& (obj, q, hh, nb, nq, ldq)
use precision
use elpa_api
implicit none
class(elpa_t) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq
#ifdef USE_ASSUMED_SIZE
complex(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
......@@ -82,13 +83,13 @@
integer(kind=ik) :: i
complex(kind=C_DATATYPE_KIND) :: h1, tau1, x(nq)
!call obj%timer%start("kernel_&
!&MATH_DATATYPE&
!&_generic_simple: single_hh_trafo_&
!&MATH_DATATYPE&
!&_generic_simple" // &
!&PRECISION_SUFFIX &
!)
call obj%timer%start("kernel_&
&MATH_DATATYPE&
&_generic_simple: single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple" // &
&PRECISION_SUFFIX &
)
! Just one Householder transformation
......@@ -108,13 +109,13 @@
enddo
!call obj%timer%stop("kernel_&
!&MATH_DATATYPE&
!&_generic_simple: single_hh_trafo_&
!&MATH_DATATYPE&
!&_generic_simple" // &
!&PRECISION_SUFFIX &
!)
call obj%timer%stop("kernel_&
&MATH_DATATYPE&
&_generic_simple: single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple" // &
&PRECISION_SUFFIX &
)
end subroutine
......@@ -127,7 +128,7 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
& (obj, q, hh, nb, nq, ldq, ldh)
#endif /* REALCASE == 1 */
......@@ -137,13 +138,15 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
& (obj, q, hh, nb, nq, ldq, ldh)
#endif /* COMPLEXCASE==1 */
use precision
use elpa_api
implicit none
class(elpa_t) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#if REALCASE==1
......@@ -170,13 +173,13 @@
#endif /* COMPLEXCASE==1 */
integer(kind=ik) :: i
!call obj%timer%start("kernel_&
!&MATH_DATATYPE&
!&_generic_simple: double_hh_trafo_&
!&MATH_DATATYPE&
!&_generic_simple" // &
!&PRECISION_SUFFIX &
!)
call obj%timer%start("kernel_&
&MATH_DATATYPE&
&_generic_simple: double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple" // &
&PRECISION_SUFFIX &
)
! Calculate dot product of the two Householder vectors
#if REALCASE==1
......@@ -246,12 +249,12 @@
q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1)
!call obj%timer%stop("kernel_&
!&MATH_DATATYPE&
!&_generic_simple: double_hh_trafo_&
!&MATH_DATATYPE&
!&_generic_simple" // &
!&PRECISION_SUFFIX &
!)
call obj%timer%stop("kernel_&
&MATH_DATATYPE&
&_generic_simple: double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple" // &
&PRECISION_SUFFIX &
)
end subroutine
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment