Commit b15d27f2 authored by Andreas Marek's avatar Andreas Marek

Unify pack_unpack_cpu

parent 9f7c384a
......@@ -27,10 +27,9 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_redist_band.F90 \
src/mod_pack_unpack_real.F90 \
src/mod_pack_unpack_cpu.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_determine_workload.F90 \
......@@ -63,6 +62,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/pack_unpack_cpu.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
......@@ -981,6 +981,7 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/pack_unpack_cpu.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
......
......@@ -45,14 +45,13 @@
use timings_dummy
#endif
use elpa2_workload
use pack_unpack_cpu
#if REALCASE == 1
use pack_unpack_real
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
#if COMPLEXCASE == 1
use pack_unpack_complex
use compute_hh_trafo_complex
#endif
use cuda_functions
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
!> Pack and unpack helpers that copy one row between a striped complex
!> work array and a contiguous row buffer.
!>
!> Which routines exist depends on the build configuration:
!>  - OpenMP builds provide the *_cpu_openmp_* variants, which address a
!>    rank-4 array a(:, n, stripe, thread);
!>  - all other builds provide the *_cpu_* variants, which address a
!>    rank-3 array a(:, n, stripe);
!>  - the *_single variants are only compiled when single precision complex
!>    support has been requested at configure time.
!>
!> The module is private by default; every pack and unpack routine that is
!> compiled is exported explicitly, so the set of accessible names is the
!> same as with default (public) accessibility.
module pack_unpack_complex
#include "config-f90.h"
  implicit none
  private

#ifdef WITH_OPENMP
  public :: pack_row_complex_cpu_openmp_double, unpack_row_complex_cpu_openmp_double
#else
  public :: pack_row_complex_cpu_double, unpack_row_complex_cpu_double
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
  public :: pack_row_complex_cpu_openmp_single, unpack_row_complex_cpu_openmp_single
#else
  public :: pack_row_complex_cpu_single, unpack_row_complex_cpu_single
#endif
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

contains

  !> Gather row n of the striped array a into the contiguous buffer row
  !> (double precision).
  !>
  !> OpenMP variant: for every thread nt and stripe i the destination offset is
  !>   noff = (nt-1)*thread_width + (i-1)*stripe_width
  !> and nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) entries are
  !> copied from a(1:nl,n,i,nt). The stripe loop of a thread ends as soon as
  !> nl is not positive.
  !>
  !> Non-OpenMP variant: every stripe contributes stripe_width entries except
  !> the last one, which contributes last_stripe_width entries.
  !>
  !> \param a      striped source array (rank 4 with OpenMP, rank 3 otherwise)
  !> \param row    destination buffer; only the addressed sections are written,
  !>               all other elements keep their previous values
  !> \param n      index of the row inside a (second dimension)
#ifdef WITH_OPENMP
  subroutine pack_row_complex_cpu_openmp_double(a, row, n, stripe_width, stripe_count, &
                                                max_threads, thread_width, l_nev)
#else
  subroutine pack_row_complex_cpu_double(a, row, n, stripe_width, last_stripe_width, stripe_count)
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    integer(kind=ik), intent(in)     :: n, stripe_width, stripe_count
#ifdef WITH_OPENMP
    integer(kind=ik), intent(in)     :: max_threads, thread_width, l_nev
    complex(kind=ck8), intent(in)    :: a(:,:,:,:)
#else
    integer(kind=ik), intent(in)     :: last_stripe_width
    complex(kind=ck8), intent(in)    :: a(:,:,:)
#endif
    ! inout rather than out: elements outside the copied sections must survive
    complex(kind=ck8), intent(inout) :: row(:)
    integer(kind=ik)                 :: i, noff, nl
#ifdef WITH_OPENMP
    integer(kind=ik)                 :: nt
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("pack_row_complex_cpu_openmp_double")
#else
    call timer%start("pack_row_complex_cpu_double")
#endif
#endif

#ifdef WITH_OPENMP
    do nt = 1, max_threads
      do i = 1, stripe_count
        noff = (nt-1)*thread_width + (i-1)*stripe_width
        nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
        ! nothing left to copy for this thread
        if (nl<=0) exit
        row(noff+1:noff+nl) = a(1:nl,n,i,nt)
      enddo
    enddo
#else
    do i = 1, stripe_count
      ! only the last stripe may be shorter than stripe_width
      nl = merge(stripe_width, last_stripe_width, i<stripe_count)
      noff = (i-1)*stripe_width
      row(noff+1:noff+nl) = a(1:nl,n,i)
    enddo
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("pack_row_complex_cpu_openmp_double")
#else
    call timer%stop("pack_row_complex_cpu_double")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine pack_row_complex_cpu_openmp_double
#else
  end subroutine pack_row_complex_cpu_double
#endif

#ifdef WITH_OPENMP
  !> Scatter the part of the contiguous buffer row that belongs to thread
  !> my_thread into row n of the striped array a (double precision, OpenMP).
  !>
  !> Uses the same offset and length computation as the OpenMP pack routine,
  !> restricted to the single thread index my_thread.
  !>
  !> \param a          striped destination array, written at a(1:nl,n,i,my_thread)
  !> \param row        contiguous source buffer
  !> \param n          index of the row inside a (second dimension)
  !> \param my_thread  thread index (fourth dimension of a)
  subroutine unpack_row_complex_cpu_openmp_double(a, row, n, my_thread, stripe_count, &
                                                  thread_width, stripe_width, l_nev)
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    ! Private variables in OMP regions (my_thread) should better be in the argument list!
    integer(kind=ik), intent(in)     :: n, my_thread
    integer(kind=ik), intent(in)     :: stripe_count, thread_width, stripe_width, l_nev
    complex(kind=ck8), intent(in)    :: row(:)
    complex(kind=ck8), intent(inout) :: a(:,:,:,:)
    integer(kind=ik)                 :: i, noff, nl

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("unpack_row_complex_cpu_openmp_double")
#endif

    do i = 1, stripe_count
      noff = (my_thread-1)*thread_width + (i-1)*stripe_width
      nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff)
      ! nothing left to copy for this thread
      if (nl<=0) exit
      a(1:nl,n,i,my_thread) = row(noff+1:noff+nl)
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("unpack_row_complex_cpu_openmp_double")
#endif
  end subroutine unpack_row_complex_cpu_openmp_double
#else /* WITH_OPENMP */
  !> Scatter the contiguous buffer row into row n of the striped array a
  !> (double precision, non-OpenMP).
  !>
  !> Every stripe receives stripe_width entries except the last one, which
  !> receives last_stripe_width entries.
  !>
  !> \param a    striped destination array, written at a(1:nl,n,i)
  !> \param row  contiguous source buffer
  !> \param n    index of the row inside a (second dimension)
  subroutine unpack_row_complex_cpu_double(a, row, n, stripe_count, stripe_width, last_stripe_width)
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    integer(kind=ik), intent(in)     :: stripe_count, stripe_width, last_stripe_width, n
    complex(kind=ck8), intent(in)    :: row(:)
    complex(kind=ck8), intent(inout) :: a(:,:,:)
    integer(kind=ik)                 :: i, noff, nl

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("unpack_row_complex_cpu_double")
#endif

    do i = 1, stripe_count
      ! only the last stripe may be shorter than stripe_width
      nl = merge(stripe_width, last_stripe_width, i<stripe_count)
      noff = (i-1)*stripe_width
      a(1:nl,n,i) = row(noff+1:noff+nl)
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("unpack_row_complex_cpu_double")
#endif
  end subroutine unpack_row_complex_cpu_double
#endif /* WITH_OPENMP */

#ifdef WANT_SINGLE_PRECISION_COMPLEX
  ! single precision implementation, at the moment duplicated !!

  !> Single precision counterpart of the double precision pack routine;
  !> identical logic, only the complex kind differs.
#ifdef WITH_OPENMP
  subroutine pack_row_complex_cpu_openmp_single(a, row, n, stripe_width, stripe_count, &
                                                max_threads, thread_width, l_nev)
#else
  subroutine pack_row_complex_cpu_single(a, row, n, stripe_width, last_stripe_width, stripe_count)
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    integer(kind=ik), intent(in)     :: n, stripe_width, stripe_count
#ifdef WITH_OPENMP
    integer(kind=ik), intent(in)     :: max_threads, thread_width, l_nev
    complex(kind=ck4), intent(in)    :: a(:,:,:,:)
#else
    integer(kind=ik), intent(in)     :: last_stripe_width
    complex(kind=ck4), intent(in)    :: a(:,:,:)
#endif
    ! inout rather than out: elements outside the copied sections must survive
    complex(kind=ck4), intent(inout) :: row(:)
    integer(kind=ik)                 :: i, noff, nl
#ifdef WITH_OPENMP
    integer(kind=ik)                 :: nt
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("pack_row_complex_cpu_openmp_single")
#else
    call timer%start("pack_row_complex_cpu_single")
#endif
#endif

#ifdef WITH_OPENMP
    do nt = 1, max_threads
      do i = 1, stripe_count
        noff = (nt-1)*thread_width + (i-1)*stripe_width
        nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
        ! nothing left to copy for this thread
        if (nl<=0) exit
        row(noff+1:noff+nl) = a(1:nl,n,i,nt)
      enddo
    enddo
#else
    do i = 1, stripe_count
      ! only the last stripe may be shorter than stripe_width
      nl = merge(stripe_width, last_stripe_width, i<stripe_count)
      noff = (i-1)*stripe_width
      row(noff+1:noff+nl) = a(1:nl,n,i)
    enddo
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("pack_row_complex_cpu_openmp_single")
#else
    call timer%stop("pack_row_complex_cpu_single")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine pack_row_complex_cpu_openmp_single
#else
  end subroutine pack_row_complex_cpu_single
#endif

#ifdef WITH_OPENMP
  !> Single precision counterpart of the double precision OpenMP unpack
  !> routine; identical logic, only the complex kind differs.
  subroutine unpack_row_complex_cpu_openmp_single(a, row, n, my_thread, stripe_count, &
                                                  thread_width, stripe_width, l_nev)
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    ! Private variables in OMP regions (my_thread) should better be in the argument list!
    integer(kind=ik), intent(in)     :: n, my_thread
    integer(kind=ik), intent(in)     :: stripe_count, thread_width, stripe_width, l_nev
    complex(kind=ck4), intent(in)    :: row(:)
    complex(kind=ck4), intent(inout) :: a(:,:,:,:)
    integer(kind=ik)                 :: i, noff, nl

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("unpack_row_complex_cpu_openmp_single")
#endif

    do i = 1, stripe_count
      noff = (my_thread-1)*thread_width + (i-1)*stripe_width
      nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff)
      ! nothing left to copy for this thread
      if (nl<=0) exit
      a(1:nl,n,i,my_thread) = row(noff+1:noff+nl)
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("unpack_row_complex_cpu_openmp_single")
#endif
  end subroutine unpack_row_complex_cpu_openmp_single
#else /* WITH_OPENMP */
  !> Single precision counterpart of the double precision non-OpenMP unpack
  !> routine; identical logic, only the complex kind differs.
  subroutine unpack_row_complex_cpu_single(a, row, n, stripe_count, stripe_width, last_stripe_width)
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none
    integer(kind=ik), intent(in)     :: stripe_count, stripe_width, last_stripe_width, n
    complex(kind=ck4), intent(in)    :: row(:)
    complex(kind=ck4), intent(inout) :: a(:,:,:)
    integer(kind=ik)                 :: i, noff, nl

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("unpack_row_complex_cpu_single")
#endif

    do i = 1, stripe_count
      ! only the last stripe may be shorter than stripe_width
      nl = merge(stripe_width, last_stripe_width, i<stripe_count)
      noff = (i-1)*stripe_width
      a(1:nl,n,i) = row(noff+1:noff+nl)
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("unpack_row_complex_cpu_single")
#endif
  end subroutine unpack_row_complex_cpu_single
#endif /* WITH_OPENMP */
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

end module pack_unpack_complex
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
!> Unified CPU pack and unpack routines for real and complex data in double
!> and (optionally) single precision.
!>
!> The routine bodies are not written here: the template file
!> pack_unpack_cpu.X90 is included once per combination of data type and
!> precision, with the selecting macros defined immediately before each
!> include and removed again afterwards.
!>
!> The module is private by default, so every generated routine has to be
!> listed in a public statement. Each public list is guarded by exactly the
!> same configuration macro as the matching template instantiation below;
!> otherwise a build with only one of the two single precision options would
!> either export names that were never generated or hide routines that were.
module pack_unpack_cpu
#include "config-f90.h"
  implicit none
  private

  ! double precision variants are always instantiated
#ifdef WITH_OPENMP
  public pack_row_real_cpu_openmp_double, unpack_row_real_cpu_openmp_double
  public pack_row_complex_cpu_openmp_double, unpack_row_complex_cpu_openmp_double
#else
  public pack_row_real_cpu_double, unpack_row_real_cpu_double
  public pack_row_complex_cpu_double, unpack_row_complex_cpu_double
#endif

  ! real single precision variants: same guard as the real single include below
#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WITH_OPENMP
  public pack_row_real_cpu_openmp_single, unpack_row_real_cpu_openmp_single
#else
  public pack_row_real_cpu_single, unpack_row_real_cpu_single
#endif
#endif /* WANT_SINGLE_PRECISION_REAL */

  ! complex single precision variants: same guard as the complex single include below
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
  public pack_row_complex_cpu_openmp_single, unpack_row_complex_cpu_openmp_single
#else
  public pack_row_complex_cpu_single, unpack_row_complex_cpu_single
#endif
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

contains

!real double precision
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_cpu.X90"
#undef REALCASE
#undef DOUBLE_PRECISION

! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_cpu.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif

!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_cpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION

! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_cpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif

end module pack_unpack_cpu
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
......@@ -40,52 +41,73 @@
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif
module pack_unpack_real
#include "config-f90.h"
implicit none
subroutine pack_row_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
public pack_row_real_cpu_openmp_double, unpack_row_real_cpu_openmp_double
&_cpu_openmp_&
#else
public pack_row_real_cpu_double, unpack_row_real_cpu_double
&_cpu_&
#endif
contains
&PRECISION &
(a, row, n, stripe_width, &
#ifdef WITH_OPENMP
subroutine pack_row_real_cpu_openmp_double(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev)
stripe_count, max_threads, thread_width, l_nev)
#else
subroutine pack_row_real_cpu_double(a, row, n, stripe_width, last_stripe_width, stripe_count)
last_stripe_width, stripe_count)
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n, stripe_count, stripe_width
#ifdef WITH_OPENMP
integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev
real(kind=rk8), intent(in) :: a(:,:,:,:)
#else
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:)
#endif
#else /* WITH_OPENMP */
integer(kind=ik), intent(in) :: last_stripe_width
real(kind=rk8), intent(in) :: a(:,:,:)
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:)
#endif
#endif /* WITH_OPENMP */
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: row(:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: row(:)
#endif
real(kind=rk8) :: row(:)
integer(kind=ik) :: i, noff, nl
#ifdef WITH_OPENMP
integer(kind=ik) :: nt
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("pack_row_&
&MATH_DATATPE&
#ifdef WITH_OPENMP
call timer%start("pack_row_real_cpu_openmp_double")
&_cpu_openmp" // &
#else
call timer%start("pack_row_real_cpu_double")
#endif
&_cpu" // &
#endif
&PRECISION_SUFFIX &
)
#ifdef WITH_OPENMP
do nt = 1, max_threads
......@@ -104,212 +126,113 @@ module pack_unpack_real
enddo
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("pack_row_&
&MATH_DATATPE&
#ifdef WITH_OPENMP
call timer%stop("pack_row_real_cpu_openmp_double")
&_cpu_openmp" // &
#else
call timer%stop("pack_row_real_cpu_double")
#endif
&_cpu" // &
#endif
&PRECISION_SUFFIX &
)
end subroutine
subroutine unpack_row_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
end subroutine pack_row_real_cpu_openmp_double
&_cpu_openmp_&
#else
end subroutine pack_row_real_cpu_double
&_cpu_&
#endif
&PRECISION &
(a, row, n, &
#ifdef WITH_OPENMP
subroutine unpack_row_real_cpu_openmp_double(a, row, n, my_thread, stripe_count, thread_width, stripe_width, l_nev)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik), intent(in) :: stripe_count, thread_width, stripe_width, l_nev
real(kind=rk8) :: a(:,:,:,:)
integer(kind=ik), intent(in) :: n, my_thread
real(kind=rk8), intent(in) :: row(:)
integer(kind=ik) :: i, noff, nl
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("unpack_row_real_cpu_openmp_double")
#endif
do i=1,stripe_count
noff = (my_thread-1)*thread_width + (i-1)*stripe_width
nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff)
if(nl<=0) exit
a(1:nl,n,i,my_thread) = row(noff+1:noff+nl)
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("unpack_row_real_cpu_openmp_double")
#endif
end subroutine unpack_row_real_cpu_openmp_double
#else /* WITH_OPENMP */
subroutine unpack_row_real_cpu_double(a, row, n, stripe_count, stripe_width, last_stripe_width)
#ifdef HAVE_DETAILED_TIMINGS
use timings
my_thread, &
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n, stripe_count, stripe_width, last_stripe_width
real(kind=rk8) :: row(:)
real(kind=rk8) :: a(:,:,:)
integer(kind=ik) :: i, noff, nl
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("unpack_row_real_cpu_double")