diff --git a/Makefile.am b/Makefile.am
index 64cdd9a35fc04dd7db6a5e26e47441a023f2110b..ded4536e15583fe7e1a1fe37823f13c8fcbe2d56 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -27,10 +27,9 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_redist_band.F90 \
- src/mod_pack_unpack_real.F90 \
+ src/mod_pack_unpack_cpu.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
- src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_determine_workload.F90 \
@@ -63,6 +62,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
+ src/pack_unpack_cpu.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
@@ -981,6 +981,7 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
+ src/pack_unpack_cpu.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
diff --git a/src/elpa2_trans_ev_tridi_to_band_template.X90 b/src/elpa2_trans_ev_tridi_to_band_template.X90
index 163d3fc2c5844b638b0bf772ceafb14b31564200..6243bbad70a977b9e815238438255d400d8ce446 100644
--- a/src/elpa2_trans_ev_tridi_to_band_template.X90
+++ b/src/elpa2_trans_ev_tridi_to_band_template.X90
@@ -45,14 +45,13 @@
use timings_dummy
#endif
use elpa2_workload
+ use pack_unpack_cpu
#if REALCASE == 1
- use pack_unpack_real
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
#if COMPLEXCASE == 1
- use pack_unpack_complex
use compute_hh_trafo_complex
#endif
use cuda_functions
diff --git a/src/mod_pack_unpack_complex.F90 b/src/mod_pack_unpack_complex.F90
deleted file mode 100644
index 0e7c66f72b09029f9ce3c876d44012a6082c18f3..0000000000000000000000000000000000000000
--- a/src/mod_pack_unpack_complex.F90
+++ /dev/null
@@ -1,309 +0,0 @@
-! This file is part of ELPA.
-!
-! The ELPA library was originally created by the ELPA consortium,
-! consisting of the following organizations:
-!
-! - Max Planck Computing and Data Facility (MPCDF), formerly known as
-! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
-! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
-! Informatik,
-! - Technische Universität München, Lehrstuhl für Informatik mit
-! Schwerpunkt Wissenschaftliches Rechnen ,
-! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
-! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
-! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
-! and
-! - IBM Deutschland GmbH
-!
-!
-! More information can be found here:
-! http://elpa.mpcdf.mpg.de/
-!
-! ELPA is free software: you can redistribute it and/or modify
-! it under the terms of the version 3 of the license of the
-! GNU Lesser General Public License as published by the Free
-! Software Foundation.
-!
-! ELPA is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-! GNU Lesser General Public License for more details.
-!
-! You should have received a copy of the GNU Lesser General Public License
-! along with ELPA. If not, see
-!
-! ELPA reflects a substantial effort on the part of the original
-! ELPA consortium, and we ask you to respect the spirit of the
-! license that we chose: i.e., please contribute any changes you
-! may have back to the original ELPA library distribution, and keep
-! any derivatives of ELPA under the same license that we chose for
-! the original distribution, the GNU Lesser General Public License.
-!
-! This file was written by A. Marek, MPCDF
-
-module pack_unpack_complex
-#include "config-f90.h"
- implicit none
-
-#ifdef WITH_OPENMP
- public pack_row_complex_cpu_openmp_double
-#else
- public pack_row_complex_cpu_double
-#endif
-
-#ifdef WANT_SINGLE_PRECISION_COMPLEX
-
-#ifdef WITH_OPENMP
- public pack_row_complex_cpu_openmp_single
-#else
- public pack_row_complex_cpu_single
-#endif
-
-#endif /* WANT_SINGLE_PRECISION_COMPLEX */
-
- contains
-#ifdef WITH_OPENMP
- subroutine pack_row_complex_cpu_openmp_double(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev)
-#else
- subroutine pack_row_complex_cpu_double(a, row, n, stripe_width, last_stripe_width, stripe_count)
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
- use timings
-#endif
- use precision
- implicit none
-#ifdef WITH_OPENMP
- integer(kind=ik), intent(in) :: stripe_width, stripe_count, max_threads, thread_width, l_nev
- complex(kind=ck8), intent(in) :: a(:,:,:,:)
-#else
- integer(kind=ik), intent(in) :: stripe_width, last_stripe_width, stripe_count
- complex(kind=ck8), intent(in) :: a(:,:,:)
-#endif
- complex(kind=ck8) :: row(:)
- integer(kind=ik) :: n, i, noff, nl, nt
-
-#ifdef HAVE_DETAILED_TIMINGS
-#ifdef WITH_OPENMP
- call timer%start("pack_row_complex_cpu_openmp_double")
-#else
- call timer%start("pack_row_complex_cpu_double")
-#endif
-#endif
-
-#ifdef WITH_OPENMP
- do nt = 1, max_threads
- do i = 1, stripe_count
- noff = (nt-1)*thread_width + (i-1)*stripe_width
- nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
- if (nl<=0) exit
- row(noff+1:noff+nl) = a(1:nl,n,i,nt)
- enddo
- enddo
-#else
- do i=1,stripe_count
- nl = merge(stripe_width, last_stripe_width, i
+!
+! ELPA reflects a substantial effort on the part of the original
+! ELPA consortium, and we ask you to respect the spirit of the
+! license that we chose: i.e., please contribute any changes you
+! may have back to the original ELPA library distribution, and keep
+! any derivatives of ELPA under the same license that we chose for
+! the original distribution, the GNU Lesser General Public License.
+!
+! This file was written by A. Marek, MPCDF
+
+module pack_unpack_cpu
+  ! CPU variants of the row pack/unpack helpers.
+  !
+  ! No procedure body is written in this file. The template
+  ! pack_unpack_cpu.X90 is included once per (datatype, precision)
+  ! combination below; the selector macros defined right before each
+  ! include decide which variant the template generates.
+  ! NOTE(review): the macros consumed by the template are presumably
+  ! provided by precision_macros.h, which is re-included before every
+  ! instantiation -- confirm against that header.
+#include "config-f90.h"
+  implicit none
+
+  private
+
+  ! Double precision variants are always generated, so they are always
+  ! exported. The OpenMP build uses a different set of procedure names.
+#ifdef WITH_OPENMP
+  public :: pack_row_real_cpu_openmp_double, unpack_row_real_cpu_openmp_double
+  public :: pack_row_complex_cpu_openmp_double, unpack_row_complex_cpu_openmp_double
+#else
+  public :: pack_row_real_cpu_double, unpack_row_real_cpu_double
+  public :: pack_row_complex_cpu_double, unpack_row_complex_cpu_double
+#endif
+
+  ! Each single precision export is guarded by the same macro that guards
+  ! the matching template include further down. A shared guard would either
+  ! hide routines that were generated or export names that were not.
+#ifdef WANT_SINGLE_PRECISION_REAL
+#ifdef WITH_OPENMP
+  public :: pack_row_real_cpu_openmp_single, unpack_row_real_cpu_openmp_single
+#else
+  public :: pack_row_real_cpu_single, unpack_row_real_cpu_single
+#endif
+#endif /* WANT_SINGLE_PRECISION_REAL */
+
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
+#ifdef WITH_OPENMP
+  public :: pack_row_complex_cpu_openmp_single, unpack_row_complex_cpu_openmp_single
+#else
+  public :: pack_row_complex_cpu_single, unpack_row_complex_cpu_single
+#endif
+#endif /* WANT_SINGLE_PRECISION_COMPLEX */
+
+  contains
+
+  ! real double precision
+#define REALCASE 1
+#define DOUBLE_PRECISION 1
+#include "precision_macros.h"
+#include "pack_unpack_cpu.X90"
+#undef REALCASE
+#undef DOUBLE_PRECISION
+
+  ! real single precision
+#if defined(WANT_SINGLE_PRECISION_REAL)
+#define REALCASE 1
+#define SINGLE_PRECISION 1
+#include "precision_macros.h"
+#include "pack_unpack_cpu.X90"
+#undef REALCASE
+#undef SINGLE_PRECISION
+#endif
+
+  ! complex double precision
+#define COMPLEXCASE 1
+#define DOUBLE_PRECISION 1
+#include "precision_macros.h"
+#include "pack_unpack_cpu.X90"
+#undef COMPLEXCASE
+#undef DOUBLE_PRECISION
+
+  ! complex single precision
+#if defined(WANT_SINGLE_PRECISION_COMPLEX)
+#define COMPLEXCASE 1
+#define SINGLE_PRECISION 1
+#include "precision_macros.h"
+#include "pack_unpack_cpu.X90"
+#undef COMPLEXCASE
+#undef SINGLE_PRECISION
+#endif
+
+end module pack_unpack_cpu
diff --git a/src/mod_pack_unpack_real.F90 b/src/mod_pack_unpack_real.F90
deleted file mode 100644
index c0a5ba66749aa60ed69fcfabbe51ebb5756bf293..0000000000000000000000000000000000000000
--- a/src/mod_pack_unpack_real.F90
+++ /dev/null
@@ -1,315 +0,0 @@
-! This file is part of ELPA.
-!
-! The ELPA library was originally created by the ELPA consortium,
-! consisting of the following organizations:
-!
-! - Max Planck Computing and Data Facility (MPCDF), formerly known as
-! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
-! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
-! Informatik,
-! - Technische Universität München, Lehrstuhl für Informatik mit
-! Schwerpunkt Wissenschaftliches Rechnen ,
-! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
-! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
-! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
-! and
-! - IBM Deutschland GmbH
-!
-!
-! More information can be found here:
-! http://elpa.mpcdf.mpg.de/
-!
-! ELPA is free software: you can redistribute it and/or modify
-! it under the terms of the version 3 of the license of the
-! GNU Lesser General Public License as published by the Free
-! Software Foundation.
-!
-! ELPA is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-! GNU Lesser General Public License for more details.
-!
-! You should have received a copy of the GNU Lesser General Public License
-! along with ELPA. If not, see
-!
-! ELPA reflects a substantial effort on the part of the original
-! ELPA consortium, and we ask you to respect the spirit of the
-! license that we chose: i.e., please contribute any changes you
-! may have back to the original ELPA library distribution, and keep
-! any derivatives of ELPA under the same license that we chose for
-! the original distribution, the GNU Lesser General Public License.
-!
-! This file was written by A. Marek, MPCDF
-
-module pack_unpack_real
-#include "config-f90.h"
- implicit none
-
-#ifdef WITH_OPENMP
- public pack_row_real_cpu_openmp_double, unpack_row_real_cpu_openmp_double
-#else
- public pack_row_real_cpu_double, unpack_row_real_cpu_double
-#endif
- contains
-
-#ifdef WITH_OPENMP
- subroutine pack_row_real_cpu_openmp_double(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev)
-#else
- subroutine pack_row_real_cpu_double(a, row, n, stripe_width, last_stripe_width, stripe_count)
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
- use timings
-#endif
- use precision
- implicit none
- integer(kind=ik), intent(in) :: n, stripe_count, stripe_width
-#ifdef WITH_OPENMP
- integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev
- real(kind=rk8), intent(in) :: a(:,:,:,:)
-#else
- integer(kind=ik), intent(in) :: last_stripe_width
- real(kind=rk8), intent(in) :: a(:,:,:)
-#endif
- real(kind=rk8) :: row(:)
-
- integer(kind=ik) :: i, noff, nl
-#ifdef WITH_OPENMP
- integer(kind=ik) :: nt
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-#ifdef WITH_OPENMP
- call timer%start("pack_row_real_cpu_openmp_double")
-
-#else
- call timer%start("pack_row_real_cpu_double")
-#endif
-#endif
-
-#ifdef WITH_OPENMP
- do nt = 1, max_threads
- do i = 1, stripe_count
- noff = (nt-1)*thread_width + (i-1)*stripe_width
- nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
- if (nl<=0) exit
- row(noff+1:noff+nl) = a(1:nl,n,i,nt)
- enddo
- enddo
-#else
- do i=1,stripe_count
- nl = merge(stripe_width, last_stripe_width, i
+!
+! ELPA reflects a substantial effort on the part of the original
+! ELPA consortium, and we ask you to respect the spirit of the
+! license that we chose: i.e., please contribute any changes you
+! may have back to the original ELPA library distribution, and keep
+! any derivatives of ELPA under the same license that we chose for
+! the original distribution, the GNU Lesser General Public License.
+!
+! This file was written by A. Marek, MPCDF
+#endif
+
+ subroutine pack_row_&
+ &MATH_DATATYPE&
+#ifdef WITH_OPENMP
+ &_cpu_openmp_&
+#else
+ &_cpu_&
+#endif
+ &PRECISION &
+ (a, row, n, stripe_width, &
+#ifdef WITH_OPENMP
+ stripe_count, max_threads, thread_width, l_nev)
+#else
+ last_stripe_width, stripe_count)
+#endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+ use timings
+#else
+ use timings_dummy
+#endif
+ use precision
+ implicit none
+ integer(kind=ik), intent(in) :: n, stripe_count, stripe_width
+#ifdef WITH_OPENMP
+ integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev
+
+#if REALCASE == 1
+ real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:)
+#endif
+#if COMPLEXCASE == 1
+ complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:)
+#endif
+
+#else /* WITH_OPENMP */
+ integer(kind=ik), intent(in) :: last_stripe_width
+#if REALCASE == 1
+ real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:)
+#endif
+#if COMPLEXCASE == 1
+ complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:)
+#endif
+
+#endif /* WITH_OPENMP */
+
+#if REALCASE == 1
+ real(kind=C_DATATYPE_KIND) :: row(:)
+#endif
+#if COMPLEXCASE == 1
+ complex(kind=C_DATATYPE_KIND) :: row(:)
+#endif
+
+ integer(kind=ik) :: i, noff, nl
+#ifdef WITH_OPENMP
+ integer(kind=ik) :: nt
+#endif
+
+ call timer%start("pack_row_&
+ &MATH_DATATPE&
+#ifdef WITH_OPENMP
+ &_cpu_openmp" // &
+#else
+ &_cpu" // &
+#endif
+ &PRECISION_SUFFIX &
+ )
+
+#ifdef WITH_OPENMP
+ do nt = 1, max_threads
+ do i = 1, stripe_count
+ noff = (nt-1)*thread_width + (i-1)*stripe_width
+ nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
+ if (nl<=0) exit
+ row(noff+1:noff+nl) = a(1:nl,n,i,nt)
+ enddo
+ enddo
+#else
+ do i=1,stripe_count
+ nl = merge(stripe_width, last_stripe_width, i